csv/reader.rs
1use std::{
2 fs::File,
3 io::{self, BufRead, Seek},
4 marker::PhantomData,
5 path::Path,
6 result,
7};
8
9use {
10 csv_core::{Reader as CoreReader, ReaderBuilder as CoreReaderBuilder},
11 serde::de::DeserializeOwned,
12};
13
14use crate::{
15 byte_record::{ByteRecord, Position},
16 error::{Error, ErrorKind, Result, Utf8Error},
17 string_record::StringRecord,
18 {Terminator, Trim},
19};
20
21/// Builds a CSV reader with various configuration knobs.
22///
23/// This builder can be used to tweak the field delimiter, record terminator
24/// and more. Once a CSV `Reader` is built, its configuration cannot be
25/// changed.
26#[derive(Debug)]
27pub struct ReaderBuilder {
28 capacity: usize,
29 flexible: bool,
30 has_headers: bool,
31 trim: Trim,
32 /// The underlying CSV parser builder.
33 ///
34 /// We explicitly put this on the heap because CoreReaderBuilder embeds an
35 /// entire DFA transition table, which, along with other things, tallies up
36 /// to almost 500 bytes on the stack.
37 builder: Box<CoreReaderBuilder>,
38}
39
40impl Default for ReaderBuilder {
41 fn default() -> ReaderBuilder {
42 ReaderBuilder {
43 capacity: 8 * (1 << 10),
44 flexible: false,
45 has_headers: true,
46 trim: Trim::default(),
47 builder: Box::new(CoreReaderBuilder::default()),
48 }
49 }
50}
51
52impl ReaderBuilder {
53 /// Create a new builder for configuring CSV parsing.
54 ///
55 /// To convert a builder into a reader, call one of the methods starting
56 /// with `from_`.
57 ///
58 /// # Example
59 ///
60 /// ```
61 /// use std::error::Error;
62 /// use csv::{ReaderBuilder, StringRecord};
63 ///
64 /// # fn main() { example().unwrap(); }
65 /// fn example() -> Result<(), Box<dyn Error>> {
66 /// let data = "\
67 /// city,country,pop
68 /// Boston,United States,4628910
69 /// Concord,United States,42695
70 /// ";
71 /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes());
72 ///
73 /// let records = rdr
74 /// .records()
75 /// .collect::<Result<Vec<StringRecord>, csv::Error>>()?;
76 /// assert_eq!(records, vec![
77 /// vec!["Boston", "United States", "4628910"],
78 /// vec!["Concord", "United States", "42695"],
79 /// ]);
80 /// Ok(())
81 /// }
82 /// ```
83 pub fn new() -> ReaderBuilder {
84 ReaderBuilder::default()
85 }
86
87 /// Build a CSV parser from this configuration that reads data from the
88 /// given file path.
89 ///
90 /// If there was a problem opening the file at the given path, then this
91 /// returns the corresponding error.
92 ///
93 /// # Example
94 ///
95 /// ```no_run
96 /// use std::error::Error;
97 /// use csv::ReaderBuilder;
98 ///
99 /// # fn main() { example().unwrap(); }
100 /// fn example() -> Result<(), Box<dyn Error>> {
101 /// let mut rdr = ReaderBuilder::new().from_path("foo.csv")?;
102 /// for result in rdr.records() {
103 /// let record = result?;
104 /// println!("{:?}", record);
105 /// }
106 /// Ok(())
107 /// }
108 /// ```
109 pub fn from_path<P: AsRef<Path>>(&self, path: P) -> Result<Reader<File>> {
110 Ok(Reader::new(self, File::open(path)?))
111 }
112
113 /// Build a CSV parser from this configuration that reads data from `rdr`.
114 ///
115 /// Note that the CSV reader is buffered automatically, so you should not
116 /// wrap `rdr` in a buffered reader like `io::BufReader`.
117 ///
118 /// # Example
119 ///
120 /// ```
121 /// use std::error::Error;
122 /// use csv::ReaderBuilder;
123 ///
124 /// # fn main() { example().unwrap(); }
125 /// fn example() -> Result<(), Box<dyn Error>> {
126 /// let data = "\
127 /// city,country,pop
128 /// Boston,United States,4628910
129 /// Concord,United States,42695
130 /// ";
131 /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes());
132 /// for result in rdr.records() {
133 /// let record = result?;
134 /// println!("{:?}", record);
135 /// }
136 /// Ok(())
137 /// }
138 /// ```
139 pub fn from_reader<R: io::Read>(&self, rdr: R) -> Reader<R> {
140 Reader::new(self, rdr)
141 }
142
143 /// The field delimiter to use when parsing CSV.
144 ///
145 /// The default is `b','`.
146 ///
147 /// # Example
148 ///
149 /// ```
150 /// use std::error::Error;
151 /// use csv::ReaderBuilder;
152 ///
153 /// # fn main() { example().unwrap(); }
154 /// fn example() -> Result<(), Box<dyn Error>> {
155 /// let data = "\
156 /// city;country;pop
157 /// Boston;United States;4628910
158 /// ";
159 /// let mut rdr = ReaderBuilder::new()
160 /// .delimiter(b';')
161 /// .from_reader(data.as_bytes());
162 ///
163 /// if let Some(result) = rdr.records().next() {
164 /// let record = result?;
165 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
166 /// Ok(())
167 /// } else {
168 /// Err(From::from("expected at least one record but got none"))
169 /// }
170 /// }
171 /// ```
172 pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
173 self.builder.delimiter(delimiter);
174 self
175 }
176
177 /// Whether to treat the first row as a special header row.
178 ///
179 /// By default, the first row is treated as a special header row, which
180 /// means the header is never returned by any of the record reading methods
181 /// or iterators. When this is disabled (`yes` set to `false`), the first
182 /// row is not treated specially.
183 ///
184 /// Note that the `headers` and `byte_headers` methods are unaffected by
185 /// whether this is set. Those methods always return the first record.
186 ///
187 /// # Example
188 ///
189 /// This example shows what happens when `has_headers` is disabled.
190 /// Namely, the first row is treated just like any other row.
191 ///
192 /// ```
193 /// use std::error::Error;
194 /// use csv::ReaderBuilder;
195 ///
196 /// # fn main() { example().unwrap(); }
197 /// fn example() -> Result<(), Box<dyn Error>> {
198 /// let data = "\
199 /// city,country,pop
200 /// Boston,United States,4628910
201 /// ";
202 /// let mut rdr = ReaderBuilder::new()
203 /// .has_headers(false)
204 /// .from_reader(data.as_bytes());
205 /// let mut iter = rdr.records();
206 ///
207 /// // Read the first record.
208 /// if let Some(result) = iter.next() {
209 /// let record = result?;
210 /// assert_eq!(record, vec!["city", "country", "pop"]);
211 /// } else {
212 /// return Err(From::from(
213 /// "expected at least two records but got none"));
214 /// }
215 ///
216 /// // Read the second record.
217 /// if let Some(result) = iter.next() {
218 /// let record = result?;
219 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
220 /// } else {
221 /// return Err(From::from(
222 /// "expected at least two records but got one"))
223 /// }
224 /// Ok(())
225 /// }
226 /// ```
227 pub fn has_headers(&mut self, yes: bool) -> &mut ReaderBuilder {
228 self.has_headers = yes;
229 self
230 }
231
232 /// Whether the number of fields in records is allowed to change or not.
233 ///
234 /// When disabled (which is the default), parsing CSV data will return an
235 /// error if a record is found with a number of fields different from the
236 /// number of fields in a previous record.
237 ///
238 /// When enabled, this error checking is turned off.
239 ///
240 /// # Example: flexible records enabled
241 ///
242 /// ```
243 /// use std::error::Error;
244 /// use csv::ReaderBuilder;
245 ///
246 /// # fn main() { example().unwrap(); }
247 /// fn example() -> Result<(), Box<dyn Error>> {
248 /// // Notice that the first row is missing the population count.
249 /// let data = "\
250 /// city,country,pop
251 /// Boston,United States
252 /// ";
253 /// let mut rdr = ReaderBuilder::new()
254 /// .flexible(true)
255 /// .from_reader(data.as_bytes());
256 ///
257 /// if let Some(result) = rdr.records().next() {
258 /// let record = result?;
259 /// assert_eq!(record, vec!["Boston", "United States"]);
260 /// Ok(())
261 /// } else {
262 /// Err(From::from("expected at least one record but got none"))
263 /// }
264 /// }
265 /// ```
266 ///
267 /// # Example: flexible records disabled
268 ///
269 /// This shows the error that appears when records of unequal length
270 /// are found and flexible records have been disabled (which is the
271 /// default).
272 ///
273 /// ```
274 /// use std::error::Error;
275 /// use csv::{ErrorKind, ReaderBuilder};
276 ///
277 /// # fn main() { example().unwrap(); }
278 /// fn example() -> Result<(), Box<dyn Error>> {
279 /// // Notice that the first row is missing the population count.
280 /// let data = "\
281 /// city,country,pop
282 /// Boston,United States
283 /// ";
284 /// let mut rdr = ReaderBuilder::new()
285 /// .flexible(false)
286 /// .from_reader(data.as_bytes());
287 ///
288 /// if let Some(Err(err)) = rdr.records().next() {
289 /// match *err.kind() {
290 /// ErrorKind::UnequalLengths { expected_len, len, .. } => {
291 /// // The header row has 3 fields...
292 /// assert_eq!(expected_len, 3);
293 /// // ... but the first row has only 2 fields.
294 /// assert_eq!(len, 2);
295 /// Ok(())
296 /// }
297 /// ref wrong => {
298 /// Err(From::from(format!(
299 /// "expected UnequalLengths error but got {:?}",
300 /// wrong)))
301 /// }
302 /// }
303 /// } else {
304 /// Err(From::from(
305 /// "expected at least one errored record but got none"))
306 /// }
307 /// }
308 /// ```
309 pub fn flexible(&mut self, yes: bool) -> &mut ReaderBuilder {
310 self.flexible = yes;
311 self
312 }
313
314 /// Whether fields are trimmed of leading and trailing whitespace or not.
315 ///
316 /// By default, no trimming is performed. This method permits one to
317 /// override that behavior and choose one of the following options:
318 ///
319 /// 1. `Trim::Headers` trims only header values.
320 /// 2. `Trim::Fields` trims only non-header or "field" values.
321 /// 3. `Trim::All` trims both header and non-header values.
322 ///
323 /// A value is only interpreted as a header value if this CSV reader is
324 /// configured to read a header record (which is the default).
325 ///
326 /// When reading string records, characters meeting the definition of
327 /// Unicode whitespace are trimmed. When reading byte records, characters
328 /// meeting the definition of ASCII whitespace are trimmed. ASCII
329 /// whitespace characters correspond to the set `[\t\n\v\f\r ]`.
330 ///
331 /// # Example
332 ///
333 /// This example shows what happens when all values are trimmed.
334 ///
335 /// ```
336 /// use std::error::Error;
337 /// use csv::{ReaderBuilder, StringRecord, Trim};
338 ///
339 /// # fn main() { example().unwrap(); }
340 /// fn example() -> Result<(), Box<dyn Error>> {
341 /// let data = "\
342 /// city , country , pop
343 /// Boston,\"
344 /// United States\",4628910
345 /// Concord, United States ,42695
346 /// ";
347 /// let mut rdr = ReaderBuilder::new()
348 /// .trim(Trim::All)
349 /// .from_reader(data.as_bytes());
350 /// let records = rdr
351 /// .records()
352 /// .collect::<Result<Vec<StringRecord>, csv::Error>>()?;
353 /// assert_eq!(records, vec![
354 /// vec!["Boston", "United States", "4628910"],
355 /// vec!["Concord", "United States", "42695"],
356 /// ]);
357 /// Ok(())
358 /// }
359 /// ```
360 pub fn trim(&mut self, trim: Trim) -> &mut ReaderBuilder {
361 self.trim = trim;
362 self
363 }
364
365 /// The record terminator to use when parsing CSV.
366 ///
367 /// A record terminator can be any single byte. The default is a special
368 /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
369 /// or `\r\n` as a single record terminator.
370 ///
371 /// # Example: `$` as a record terminator
372 ///
373 /// ```
374 /// use std::error::Error;
375 /// use csv::{ReaderBuilder, Terminator};
376 ///
377 /// # fn main() { example().unwrap(); }
378 /// fn example() -> Result<(), Box<dyn Error>> {
379 /// let data = "city,country,pop$Boston,United States,4628910";
380 /// let mut rdr = ReaderBuilder::new()
381 /// .terminator(Terminator::Any(b'$'))
382 /// .from_reader(data.as_bytes());
383 ///
384 /// if let Some(result) = rdr.records().next() {
385 /// let record = result?;
386 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
387 /// Ok(())
388 /// } else {
389 /// Err(From::from("expected at least one record but got none"))
390 /// }
391 /// }
392 /// ```
393 pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
394 self.builder.terminator(term.to_core());
395 self
396 }
397
398 /// The quote character to use when parsing CSV.
399 ///
400 /// The default is `b'"'`.
401 ///
402 /// # Example: single quotes instead of double quotes
403 ///
404 /// ```
405 /// use std::error::Error;
406 /// use csv::ReaderBuilder;
407 ///
408 /// # fn main() { example().unwrap(); }
409 /// fn example() -> Result<(), Box<dyn Error>> {
410 /// let data = "\
411 /// city,country,pop
412 /// Boston,'United States',4628910
413 /// ";
414 /// let mut rdr = ReaderBuilder::new()
415 /// .quote(b'\'')
416 /// .from_reader(data.as_bytes());
417 ///
418 /// if let Some(result) = rdr.records().next() {
419 /// let record = result?;
420 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
421 /// Ok(())
422 /// } else {
423 /// Err(From::from("expected at least one record but got none"))
424 /// }
425 /// }
426 /// ```
427 pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
428 self.builder.quote(quote);
429 self
430 }
431
432 /// The escape character to use when parsing CSV.
433 ///
434 /// In some variants of CSV, quotes are escaped using a special escape
435 /// character like `\` (instead of escaping quotes by doubling them).
436 ///
437 /// By default, recognizing these idiosyncratic escapes is disabled.
438 ///
439 /// # Example
440 ///
441 /// ```
442 /// use std::error::Error;
443 /// use csv::ReaderBuilder;
444 ///
445 /// # fn main() { example().unwrap(); }
446 /// fn example() -> Result<(), Box<dyn Error>> {
447 /// let data = "\
448 /// city,country,pop
449 /// Boston,\"The \\\"United\\\" States\",4628910
450 /// ";
451 /// let mut rdr = ReaderBuilder::new()
452 /// .escape(Some(b'\\'))
453 /// .from_reader(data.as_bytes());
454 ///
455 /// if let Some(result) = rdr.records().next() {
456 /// let record = result?;
457 /// assert_eq!(record, vec![
458 /// "Boston", "The \"United\" States", "4628910",
459 /// ]);
460 /// Ok(())
461 /// } else {
462 /// Err(From::from("expected at least one record but got none"))
463 /// }
464 /// }
465 /// ```
466 pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
467 self.builder.escape(escape);
468 self
469 }
470
471 /// Enable double quote escapes.
472 ///
473 /// This is enabled by default, but it may be disabled. When disabled,
474 /// doubled quotes are not interpreted as escapes.
475 ///
476 /// # Example
477 ///
478 /// ```
479 /// use std::error::Error;
480 /// use csv::ReaderBuilder;
481 ///
482 /// # fn main() { example().unwrap(); }
483 /// fn example() -> Result<(), Box<dyn Error>> {
484 /// let data = "\
485 /// city,country,pop
486 /// Boston,\"The \"\"United\"\" States\",4628910
487 /// ";
488 /// let mut rdr = ReaderBuilder::new()
489 /// .double_quote(false)
490 /// .from_reader(data.as_bytes());
491 ///
492 /// if let Some(result) = rdr.records().next() {
493 /// let record = result?;
494 /// assert_eq!(record, vec![
495 /// "Boston", "The \"United\"\" States\"", "4628910",
496 /// ]);
497 /// Ok(())
498 /// } else {
499 /// Err(From::from("expected at least one record but got none"))
500 /// }
501 /// }
502 /// ```
503 pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
504 self.builder.double_quote(yes);
505 self
506 }
507
508 /// Enable or disable quoting.
509 ///
510 /// This is enabled by default, but it may be disabled. When disabled,
511 /// quotes are not treated specially.
512 ///
513 /// # Example
514 ///
515 /// ```
516 /// use std::error::Error;
517 /// use csv::ReaderBuilder;
518 ///
519 /// # fn main() { example().unwrap(); }
520 /// fn example() -> Result<(), Box<dyn Error>> {
521 /// let data = "\
522 /// city,country,pop
523 /// Boston,\"The United States,4628910
524 /// ";
525 /// let mut rdr = ReaderBuilder::new()
526 /// .quoting(false)
527 /// .from_reader(data.as_bytes());
528 ///
529 /// if let Some(result) = rdr.records().next() {
530 /// let record = result?;
531 /// assert_eq!(record, vec![
532 /// "Boston", "\"The United States", "4628910",
533 /// ]);
534 /// Ok(())
535 /// } else {
536 /// Err(From::from("expected at least one record but got none"))
537 /// }
538 /// }
539 /// ```
540 pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
541 self.builder.quoting(yes);
542 self
543 }
544
545 /// The comment character to use when parsing CSV.
546 ///
547 /// If the start of a record begins with the byte given here, then that
548 /// line is ignored by the CSV parser.
549 ///
550 /// This is disabled by default.
551 ///
552 /// # Example
553 ///
554 /// ```
555 /// use std::error::Error;
556 /// use csv::ReaderBuilder;
557 ///
558 /// # fn main() { example().unwrap(); }
559 /// fn example() -> Result<(), Box<dyn Error>> {
560 /// let data = "\
561 /// city,country,pop
562 /// #Concord,United States,42695
563 /// Boston,United States,4628910
564 /// ";
565 /// let mut rdr = ReaderBuilder::new()
566 /// .comment(Some(b'#'))
567 /// .from_reader(data.as_bytes());
568 ///
569 /// if let Some(result) = rdr.records().next() {
570 /// let record = result?;
571 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
572 /// Ok(())
573 /// } else {
574 /// Err(From::from("expected at least one record but got none"))
575 /// }
576 /// }
577 /// ```
578 pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
579 self.builder.comment(comment);
580 self
581 }
582
583 /// A convenience method for specifying a configuration to read ASCII
584 /// delimited text.
585 ///
586 /// This sets the delimiter and record terminator to the ASCII unit
587 /// separator (`\x1F`) and record separator (`\x1E`), respectively.
588 ///
589 /// # Example
590 ///
591 /// ```
592 /// use std::error::Error;
593 /// use csv::ReaderBuilder;
594 ///
595 /// # fn main() { example().unwrap(); }
596 /// fn example() -> Result<(), Box<dyn Error>> {
597 /// let data = "\
598 /// city\x1Fcountry\x1Fpop\x1EBoston\x1FUnited States\x1F4628910";
599 /// let mut rdr = ReaderBuilder::new()
600 /// .ascii()
601 /// .from_reader(data.as_bytes());
602 ///
603 /// if let Some(result) = rdr.records().next() {
604 /// let record = result?;
605 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
606 /// Ok(())
607 /// } else {
608 /// Err(From::from("expected at least one record but got none"))
609 /// }
610 /// }
611 /// ```
612 pub fn ascii(&mut self) -> &mut ReaderBuilder {
613 self.builder.ascii();
614 self
615 }
616
617 /// Set the capacity (in bytes) of the buffer used in the CSV reader.
618 /// This defaults to 8 KiB.
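    ///
    /// # Example
    ///
    /// A minimal sketch; the 1 MiB capacity used here is illustrative, not a
    /// recommendation.
    ///
    /// ```
    /// use csv::ReaderBuilder;
    ///
    /// let data = "\
    /// city,country,pop
    /// Boston,United States,4628910
    /// ";
    /// let mut rdr = ReaderBuilder::new()
    ///     .buffer_capacity(1 << 20)
    ///     .from_reader(data.as_bytes());
    /// let headers = rdr.headers().unwrap();
    /// assert_eq!(headers, vec!["city", "country", "pop"]);
    /// ```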
619 pub fn buffer_capacity(&mut self, capacity: usize) -> &mut ReaderBuilder {
620 self.capacity = capacity;
621 self
622 }
623
624 /// Enable or disable the NFA for parsing CSV.
625 ///
626 /// This is intended to be a debug option. The NFA is always slower than
627 /// the DFA.
628 #[doc(hidden)]
629 pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
630 self.builder.nfa(yes);
631 self
632 }
633}
634
635/// An already configured CSV reader.
636///
637/// A CSV reader takes as input CSV data and transforms that into standard Rust
638/// values. The most flexible way to read CSV data is as a sequence of records,
639/// where a record is a sequence of fields and each field is a string. However,
640/// a reader can also deserialize CSV data into Rust types like `i64` or
641/// `(String, f64, f64, f64)` or even a custom struct automatically using
642/// Serde.
643///
644/// # Configuration
645///
646/// A CSV reader has a couple convenient constructor methods like `from_path`
647/// and `from_reader`. However, if you want to configure the CSV reader to use
648/// a different delimiter or quote character (among many other things), then
649/// you should use a [`ReaderBuilder`](struct.ReaderBuilder.html) to construct
650/// a `Reader`. For example, to change the field delimiter:
651///
652/// ```
653/// use std::error::Error;
654/// use csv::ReaderBuilder;
655///
656/// # fn main() { example().unwrap(); }
657/// fn example() -> Result<(), Box<dyn Error>> {
658/// let data = "\
659/// city;country;pop
660/// Boston;United States;4628910
661/// ";
662/// let mut rdr = ReaderBuilder::new()
663/// .delimiter(b';')
664/// .from_reader(data.as_bytes());
665///
666/// if let Some(result) = rdr.records().next() {
667/// let record = result?;
668/// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
669/// Ok(())
670/// } else {
671/// Err(From::from("expected at least one record but got none"))
672/// }
673/// }
674/// ```
675///
676/// # Error handling
677///
678/// In general, CSV *parsing* does not ever return an error. That is, there is
679/// no such thing as malformed CSV data. Instead, this reader will prioritize
680/// finding a parse over rejecting CSV data that it does not understand. This
681/// choice was inspired by other popular CSV parsers, but also because it is
682/// pragmatic. CSV data varies wildly, so even if the CSV data is malformed,
683/// it might still be possible to work with the data. In the land of CSV, there
684/// is no "right" or "wrong," only "right" and "less right."
685///
686/// With that said, a number of errors can occur while reading CSV data:
687///
688/// * By default, all records in CSV data must have the same number of fields.
689/// If a record is found with a different number of fields than a prior
690/// record, then an error is returned. This behavior can be disabled by
691/// enabling flexible parsing via the `flexible` method on
692/// [`ReaderBuilder`](struct.ReaderBuilder.html).
693/// * When reading CSV data from a resource (like a file), it is possible for
694/// reading from the underlying resource to fail. This will return an error.
695/// After encountering such an error, subsequent calls to the `Reader`
696/// (unless `seek` is used) behave as if end of file has been reached, in
697/// order to avoid an infinite loop of repeatedly attempting to read a
698/// record that keeps failing.
699/// * When reading CSV data into `String` or `&str` fields (e.g., via a
700/// [`StringRecord`](struct.StringRecord.html)), UTF-8 is strictly
701/// enforced. If CSV data is invalid UTF-8, then an error is returned. If
702/// you want to read invalid UTF-8, then you should use the byte oriented
703/// APIs such as [`ByteRecord`](struct.ByteRecord.html). If you need explicit
704/// support for another encoding entirely, then you'll need to use another
705/// crate to transcode your CSV data to UTF-8 first; see the example below.
706/// * When using Serde to deserialize CSV data into Rust types, it is possible
707/// for a number of additional errors to occur. For example, deserializing
708/// a field `xyz` into an `i32` field will result in an error.
709///
710/// For more details on the precise semantics of errors, see the
711/// [`Error`](enum.Error.html) type.
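///
/// # Example: invalid UTF-8
///
/// A small sketch of the UTF-8 point above (the byte string is illustrative):
/// the byte oriented API reads a record that is not valid UTF-8, while the
/// string oriented API reports a UTF-8 error for the same data.
///
/// ```
/// use std::error::Error;
/// use csv::ReaderBuilder;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<dyn Error>> {
///     // The second field contains an invalid UTF-8 byte (`\xFF`).
///     let data = b"city,pop\nBoston,\xFF4628910\n";
///
///     // Byte oriented reading succeeds...
///     let mut rdr = ReaderBuilder::new().from_reader(&data[..]);
///     let record = rdr.byte_records().next().unwrap()?;
///     assert_eq!(&record[0], &b"Boston"[..]);
///
///     // ...but string oriented reading reports a UTF-8 error.
///     let mut rdr = ReaderBuilder::new().from_reader(&data[..]);
///     assert!(rdr.records().next().unwrap().is_err());
///     Ok(())
/// }
/// ```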
712#[derive(Debug)]
713pub struct Reader<R> {
714 /// The underlying CSV parser.
715 ///
716 /// We explicitly put this on the heap because CoreReader embeds an entire
717 /// DFA transition table, which, along with other things, tallies up to
718 /// almost 500 bytes on the stack.
719 core: Box<CoreReader>,
720 /// The underlying reader.
721 rdr: io::BufReader<R>,
722 /// Various state tracking.
723 ///
724 /// There is more state embedded in the `CoreReader`.
725 state: ReaderState,
726}
727
728#[derive(Debug)]
729struct ReaderState {
730 /// When set, this contains the first row of any parsed CSV data.
731 ///
732 /// This is always populated, regardless of whether `has_headers` is set.
733 headers: Option<Headers>,
734 /// When set, the first row of parsed CSV data is excluded from things
735 /// that read records, like iterators and `read_record`.
736 has_headers: bool,
737 /// When set, there is no restriction on the length of records. When not
738 /// set, every record must have the same number of fields, or else an error
739 /// is reported.
740 flexible: bool,
741 trim: Trim,
742 /// The number of fields in the first record parsed.
743 first_field_count: Option<u64>,
744 /// The current position of the parser.
745 ///
746 /// Note that this position is only observable by callers at the start
747 /// of a record. More granular positions are not supported.
748 cur_pos: Position,
749 /// Whether the first record has been read or not.
750 first: bool,
751 /// Whether the reader has been seeked or not.
752 seeked: bool,
753 /// Whether EOF of the underlying reader has been reached or not.
754 ///
755 /// IO errors on the underlying reader will be considered as an EOF for
756 /// subsequent read attempts, as it would be incorrect to keep on trying
757 /// to read when the underlying reader has broken.
758 ///
759 /// For clarity, to get the most useful `Debug` impl, and in case the
760 /// two cases ever need to be treated differently, we record whether the
761 /// EOF state is due to an actual EOF or to an IO error encountered on
762 /// the underlying reader.
763 /// This has no additional runtime cost.
764 eof: ReaderEofState,
765}
766
767/// Whether EOF of the underlying reader has been reached or not.
768///
769/// IO errors on the underlying reader will be considered as an EOF for
770/// subsequent read attempts, as it would be incorrect to keep on trying
771/// to read when the underlying reader has broken.
772///
773/// For clarity, to get the most useful `Debug` impl, and in case the two
774/// cases ever need to be treated differently, we record whether the EOF
775/// state is due to an actual EOF or to an IO error encountered on the
776/// underlying reader.
777#[derive(Debug, Clone, Copy, PartialEq, Eq)]
778enum ReaderEofState {
779 NotEof,
780 Eof,
781 IOError,
782}
783
784/// Headers encapsulates any data associated with the headers of CSV data.
785///
786/// The headers always correspond to the first row.
787#[derive(Debug)]
788struct Headers {
789 /// The header, as raw bytes.
790 byte_record: ByteRecord,
791 /// The header, as valid UTF-8 (or a UTF-8 error).
792 string_record: result::Result<StringRecord, Utf8Error>,
793}
794
795impl Reader<Reader<File>> {
796 /// Create a new CSV parser with a default configuration for the given
797 /// file path.
798 ///
799 /// To customize CSV parsing, use a `ReaderBuilder`.
800 ///
801 /// # Example
802 ///
803 /// ```no_run
804 /// use std::error::Error;
805 /// use csv::Reader;
806 ///
807 /// # fn main() { example().unwrap(); }
808 /// fn example() -> Result<(), Box<dyn Error>> {
809 /// let mut rdr = Reader::from_path("foo.csv")?;
810 /// for result in rdr.records() {
811 /// let record = result?;
812 /// println!("{:?}", record);
813 /// }
814 /// Ok(())
815 /// }
816 /// ```
817 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Reader<File>> {
818 ReaderBuilder::new().from_path(path)
819 }
820}
821
822impl<R: io::Read> Reader<R> {
823 /// Create a new CSV reader given a builder and a source of underlying
824 /// bytes.
825 fn new(builder: &ReaderBuilder, rdr: R) -> Reader<R> {
826 Reader {
827 core: Box::new(builder.builder.build()),
828 rdr: io::BufReader::with_capacity(builder.capacity, rdr),
829 state: ReaderState {
830 headers: None,
831 has_headers: builder.has_headers,
832 flexible: builder.flexible,
833 trim: builder.trim,
834 first_field_count: None,
835 cur_pos: Position::new(),
836 first: false,
837 seeked: false,
838 eof: ReaderEofState::NotEof,
839 },
840 }
841 }
842
843 /// Create a new CSV parser with a default configuration for the given
844 /// reader.
845 ///
846 /// To customize CSV parsing, use a `ReaderBuilder`.
847 ///
848 /// # Example
849 ///
850 /// ```
851 /// use std::error::Error;
852 /// use csv::Reader;
853 ///
854 /// # fn main() { example().unwrap(); }
855 /// fn example() -> Result<(), Box<dyn Error>> {
856 /// let data = "\
857 /// city,country,pop
858 /// Boston,United States,4628910
859 /// Concord,United States,42695
860 /// ";
861 /// let mut rdr = Reader::from_reader(data.as_bytes());
862 /// for result in rdr.records() {
863 /// let record = result?;
864 /// println!("{:?}", record);
865 /// }
866 /// Ok(())
867 /// }
868 /// ```
869 pub fn from_reader(rdr: R) -> Reader<R> {
870 ReaderBuilder::new().from_reader(rdr)
871 }
872
873 /// Returns a borrowed iterator over deserialized records.
874 ///
875 /// Each item yielded by this iterator is a `Result<D, Error>`.
876 /// Therefore, in order to access the record, callers must handle the
877 /// possibility of error (typically with `try!` or `?`).
878 ///
879 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
880 /// default), then this does not include the first record. Additionally,
881 /// if `has_headers` is enabled, then deserializing into a struct will
882 /// automatically align the values in each row to the fields of a struct
883 /// based on the header row.
884 ///
885 /// # Example
886 ///
887 /// This shows how to deserialize CSV data into normal Rust structs. The
888 /// fields of the header row are used to match up the values in each row
889 /// to the fields of the struct.
890 ///
891 /// ```
892 /// use std::error::Error;
893 ///
894 /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)]
895 /// struct Row {
896 /// city: String,
897 /// country: String,
898 /// #[serde(rename = "popcount")]
899 /// population: u64,
900 /// }
901 ///
902 /// # fn main() { example().unwrap(); }
903 /// fn example() -> Result<(), Box<dyn Error>> {
904 /// let data = "\
905 /// city,country,popcount
906 /// Boston,United States,4628910
907 /// ";
908 /// let mut rdr = csv::Reader::from_reader(data.as_bytes());
909 /// let mut iter = rdr.deserialize();
910 ///
911 /// if let Some(result) = iter.next() {
912 /// let record: Row = result?;
913 /// assert_eq!(record, Row {
914 /// city: "Boston".to_string(),
915 /// country: "United States".to_string(),
916 /// population: 4628910,
917 /// });
918 /// Ok(())
919 /// } else {
920 /// Err(From::from("expected at least one record but got none"))
921 /// }
922 /// }
923 /// ```
924 ///
925 /// # Rules
926 ///
927 /// For the most part, any Rust type that maps straight-forwardly to a CSV
928 /// record is supported. This includes maps, structs, tuples and tuple
929 /// structs. Other Rust types, such as `Vec`s, arrays, and enums have
930 /// a more complicated story. In general, when working with CSV data, one
931 /// should avoid *nested sequences* as much as possible.
932 ///
933 /// Maps, structs, tuples and tuple structs map to CSV records in a simple
934 /// way. Tuples and tuple structs decode their fields in the order that
935 /// they are defined. Structs will do the same only if `has_headers` has
936 /// been disabled using [`ReaderBuilder`](struct.ReaderBuilder.html),
937 /// otherwise, structs and maps are deserialized based on the fields
938 /// defined in the header row. (If there is no header row, then
939 /// deserializing into a map will result in an error.)
940 ///
941 /// Nested sequences are supported in a limited capacity. Namely, they
942 /// are flattened. As a result, it's often useful to use a `Vec` to capture
943 /// a "tail" of fields in a record:
944 ///
945 /// ```
946 /// use std::error::Error;
947 ///
948 /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)]
949 /// struct Row {
950 /// label: String,
951 /// values: Vec<i32>,
952 /// }
953 ///
954 /// # fn main() { example().unwrap(); }
955 /// fn example() -> Result<(), Box<dyn Error>> {
956 /// let data = "foo,1,2,3";
957 /// let mut rdr = csv::ReaderBuilder::new()
958 /// .has_headers(false)
959 /// .from_reader(data.as_bytes());
960 /// let mut iter = rdr.deserialize();
961 ///
962 /// if let Some(result) = iter.next() {
963 /// let record: Row = result?;
964 /// assert_eq!(record, Row {
965 /// label: "foo".to_string(),
966 /// values: vec![1, 2, 3],
967 /// });
968 /// Ok(())
969 /// } else {
970 /// Err(From::from("expected at least one record but got none"))
971 /// }
972 /// }
973 /// ```
974 ///
975 /// In the above example, adding another field to the `Row` struct after
976 /// the `values` field will result in a deserialization error. This is
977 /// because the deserializer doesn't know when to stop reading fields
978 /// into the `values` vector, so it will consume the rest of the fields in
979 /// the record, leaving none left over for the additional field.
980 ///
981 /// Finally, simple enums in Rust can be deserialized as well. Namely,
982 /// enums must either be variants with no arguments or variants with a
983 /// single argument. Variants with no arguments are deserialized based on
984 /// which variant name the field matches. Variants with a single argument
985 /// are deserialized based on which variant can store the data. The latter
986 /// is only supported when using "untagged" enum deserialization. The
987 /// following example shows both forms in action:
988 ///
989 /// ```
990 /// use std::error::Error;
991 ///
992 /// #[derive(Debug, serde::Deserialize, PartialEq)]
993 /// struct Row {
994 /// label: Label,
995 /// value: Number,
996 /// }
997 ///
998 /// #[derive(Debug, serde::Deserialize, PartialEq)]
999 /// #[serde(rename_all = "lowercase")]
1000 /// enum Label {
1001 /// Celsius,
1002 /// Fahrenheit,
1003 /// }
1004 ///
1005 /// #[derive(Debug, serde::Deserialize, PartialEq)]
1006 /// #[serde(untagged)]
1007 /// enum Number {
1008 /// Integer(i64),
1009 /// Float(f64),
1010 /// }
1011 ///
1012 /// # fn main() { example().unwrap(); }
1013 /// fn example() -> Result<(), Box<dyn Error>> {
1014 /// let data = "\
1015 /// label,value
1016 /// celsius,22.2222
1017 /// fahrenheit,72
1018 /// ";
1019 /// let mut rdr = csv::Reader::from_reader(data.as_bytes());
1020 /// let mut iter = rdr.deserialize();
1021 ///
1022 /// // Read the first record.
1023 /// if let Some(result) = iter.next() {
1024 /// let record: Row = result?;
1025 /// assert_eq!(record, Row {
1026 /// label: Label::Celsius,
1027 /// value: Number::Float(22.2222),
1028 /// });
1029 /// } else {
1030 /// return Err(From::from(
1031 /// "expected at least two records but got none"));
1032 /// }
1033 ///
1034 /// // Read the second record.
1035 /// if let Some(result) = iter.next() {
1036 /// let record: Row = result?;
1037 /// assert_eq!(record, Row {
1038 /// label: Label::Fahrenheit,
1039 /// value: Number::Integer(72),
1040 /// });
1041 /// Ok(())
1042 /// } else {
1043 /// Err(From::from(
1044 /// "expected at least two records but got only one"))
1045 /// }
1046 /// }
1047 /// ```
1048 pub fn deserialize<D>(&mut self) -> DeserializeRecordsIter<R, D>
1049 where
1050 D: DeserializeOwned,
1051 {
1052 DeserializeRecordsIter::new(self)
1053 }
1054
1055 /// Returns an owned iterator over deserialized records.
1056 ///
1057 /// Each item yielded by this iterator is a `Result<D, Error>`.
1058 /// Therefore, in order to access the record, callers must handle the
1059 /// possibility of error (typically with `try!` or `?`).
1060 ///
1061 /// This is mostly useful when you want to return a CSV iterator or store
1062 /// it somewhere.
1063 ///
1064 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1065 /// default), then this does not include the first record. Additionally,
1066 /// if `has_headers` is enabled, then deserializing into a struct will
1067 /// automatically align the values in each row to the fields of a struct
1068 /// based on the header row.
1069 ///
1070 /// For more detailed deserialization rules, see the documentation on the
1071 /// `deserialize` method.
1072 ///
1073 /// # Example
1074 ///
1075 /// ```
1076 /// use std::error::Error;
1077 ///
1078 /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)]
1079 /// struct Row {
1080 /// city: String,
1081 /// country: String,
1082 /// #[serde(rename = "popcount")]
1083 /// population: u64,
1084 /// }
1085 ///
1086 /// # fn main() { example().unwrap(); }
1087 /// fn example() -> Result<(), Box<dyn Error>> {
1088 /// let data = "\
1089 /// city,country,popcount
1090 /// Boston,United States,4628910
1091 /// ";
1092 /// let rdr = csv::Reader::from_reader(data.as_bytes());
1093 /// let mut iter = rdr.into_deserialize();
1094 ///
1095 /// if let Some(result) = iter.next() {
1096 /// let record: Row = result?;
1097 /// assert_eq!(record, Row {
1098 /// city: "Boston".to_string(),
1099 /// country: "United States".to_string(),
1100 /// population: 4628910,
1101 /// });
1102 /// Ok(())
1103 /// } else {
1104 /// Err(From::from("expected at least one record but got none"))
1105 /// }
1106 /// }
1107 /// ```
1108 pub fn into_deserialize<D>(self) -> DeserializeRecordsIntoIter<R, D>
1109 where
1110 D: DeserializeOwned,
1111 {
1112 DeserializeRecordsIntoIter::new(self)
1113 }
1114
1115 /// Returns a borrowed iterator over all records as strings.
1116 ///
1117 /// Each item yielded by this iterator is a `Result<StringRecord, Error>`.
1118 /// Therefore, in order to access the record, callers must handle the
1119 /// possibility of error (typically with `try!` or `?`).
1120 ///
1121 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1122 /// default), then this does not include the first record.
1123 ///
1124 /// # Example
1125 ///
1126 /// ```
1127 /// use std::error::Error;
1128 /// use csv::Reader;
1129 ///
1130 /// # fn main() { example().unwrap(); }
1131 /// fn example() -> Result<(), Box<dyn Error>> {
1132 /// let data = "\
1133 /// city,country,pop
1134 /// Boston,United States,4628910
1135 /// ";
1136 /// let mut rdr = Reader::from_reader(data.as_bytes());
1137 /// let mut iter = rdr.records();
1138 ///
1139 /// if let Some(result) = iter.next() {
1140 /// let record = result?;
1141 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1142 /// Ok(())
1143 /// } else {
1144 /// Err(From::from("expected at least one record but got none"))
1145 /// }
1146 /// }
1147 /// ```
1148 pub fn records(&mut self) -> StringRecordsIter<R> {
1149 StringRecordsIter::new(self)
1150 }
1151
1152 /// Returns an owned iterator over all records as strings.
1153 ///
1154 /// Each item yielded by this iterator is a `Result<StringRecord, Error>`.
1155 /// Therefore, in order to access the record, callers must handle the
1156 /// possibility of error (typically with `try!` or `?`).
1157 ///
1158 /// This is mostly useful when you want to return a CSV iterator or store
1159 /// it somewhere.
1160 ///
1161 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1162 /// default), then this does not include the first record.
1163 ///
1164 /// # Example
1165 ///
1166 /// ```
1167 /// use std::error::Error;
1168 /// use csv::Reader;
1169 ///
1170 /// # fn main() { example().unwrap(); }
1171 /// fn example() -> Result<(), Box<dyn Error>> {
1172 /// let data = "\
1173 /// city,country,pop
1174 /// Boston,United States,4628910
1175 /// ";
1176 /// let rdr = Reader::from_reader(data.as_bytes());
1177 /// let mut iter = rdr.into_records();
1178 ///
1179 /// if let Some(result) = iter.next() {
1180 /// let record = result?;
1181 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1182 /// Ok(())
1183 /// } else {
1184 /// Err(From::from("expected at least one record but got none"))
1185 /// }
1186 /// }
1187 /// ```
1188 pub fn into_records(self) -> StringRecordsIntoIter<R> {
1189 StringRecordsIntoIter::new(self)
1190 }
1191
1192 /// Returns a borrowed iterator over all records as raw bytes.
1193 ///
1194 /// Each item yielded by this iterator is a `Result<ByteRecord, Error>`.
1195 /// Therefore, in order to access the record, callers must handle the
1196 /// possibility of error (typically with `try!` or `?`).
1197 ///
1198 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1199 /// default), then this does not include the first record.
1200 ///
1201 /// # Example
1202 ///
1203 /// ```
1204 /// use std::error::Error;
1205 /// use csv::Reader;
1206 ///
1207 /// # fn main() { example().unwrap(); }
1208 /// fn example() -> Result<(), Box<dyn Error>> {
1209 /// let data = "\
1210 /// city,country,pop
1211 /// Boston,United States,4628910
1212 /// ";
1213 /// let mut rdr = Reader::from_reader(data.as_bytes());
1214 /// let mut iter = rdr.byte_records();
1215 ///
1216 /// if let Some(result) = iter.next() {
1217 /// let record = result?;
1218 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1219 /// Ok(())
1220 /// } else {
1221 /// Err(From::from("expected at least one record but got none"))
1222 /// }
1223 /// }
1224 /// ```
1225 pub fn byte_records(&mut self) -> ByteRecordsIter<R> {
1226 ByteRecordsIter::new(self)
1227 }
1228
1229 /// Returns an owned iterator over all records as raw bytes.
1230 ///
1231 /// Each item yielded by this iterator is a `Result<ByteRecord, Error>`.
1232 /// Therefore, in order to access the record, callers must handle the
1233 /// possibility of error (typically with `try!` or `?`).
1234 ///
1235 /// This is mostly useful when you want to return a CSV iterator or store
1236 /// it somewhere.
1237 ///
1238 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1239 /// default), then this does not include the first record.
1240 ///
1241 /// # Example
1242 ///
1243 /// ```
1244 /// use std::error::Error;
1245 /// use csv::Reader;
1246 ///
1247 /// # fn main() { example().unwrap(); }
1248 /// fn example() -> Result<(), Box<dyn Error>> {
1249 /// let data = "\
1250 /// city,country,pop
1251 /// Boston,United States,4628910
1252 /// ";
1253 /// let rdr = Reader::from_reader(data.as_bytes());
1254 /// let mut iter = rdr.into_byte_records();
1255 ///
1256 /// if let Some(result) = iter.next() {
1257 /// let record = result?;
1258 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1259 /// Ok(())
1260 /// } else {
1261 /// Err(From::from("expected at least one record but got none"))
1262 /// }
1263 /// }
1264 /// ```
1265 pub fn into_byte_records(self) -> ByteRecordsIntoIter<R> {
1266 ByteRecordsIntoIter::new(self)
1267 }
1268
1269 /// Returns a reference to the first row read by this parser.
1270 ///
1271 /// If no row has been read yet, then this will force parsing of the first
1272 /// row.
1273 ///
1274 /// If there was a problem parsing the row or if it wasn't valid UTF-8,
1275 /// then this returns an error.
1276 ///
1277 /// If the underlying reader emits EOF before any data, then this returns
1278 /// an empty record.
1279 ///
1280 /// Note that this method may be used regardless of whether `has_headers`
1281 /// was enabled (but it is enabled by default).
1282 ///
1283 /// # Example
1284 ///
1285 /// This example shows how to get the header row of CSV data. Notice that
1286 /// the header row does not appear as a record in the iterator!
1287 ///
1288 /// ```
1289 /// use std::error::Error;
1290 /// use csv::Reader;
1291 ///
1292 /// # fn main() { example().unwrap(); }
1293 /// fn example() -> Result<(), Box<dyn Error>> {
1294 /// let data = "\
1295 /// city,country,pop
1296 /// Boston,United States,4628910
1297 /// ";
1298 /// let mut rdr = Reader::from_reader(data.as_bytes());
1299 ///
1300 /// // We can read the headers before iterating.
1301 /// {
1302 /// // `headers` borrows from the reader, so we put this in its
1303 /// // own scope. That way, the borrow ends before we try iterating
1304 /// // below. Alternatively, we could clone the headers.
1305 /// let headers = rdr.headers()?;
1306 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1307 /// }
1308 ///
1309 /// if let Some(result) = rdr.records().next() {
1310 /// let record = result?;
1311 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1312 /// } else {
1313 /// return Err(From::from(
1314 /// "expected at least one record but got none"))
1315 /// }
1316 ///
1317 /// // We can also read the headers after iterating.
1318 /// let headers = rdr.headers()?;
1319 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1320 /// Ok(())
1321 /// }
1322 /// ```
1323 pub fn headers(&mut self) -> Result<&StringRecord> {
1324 if self.state.headers.is_none() {
1325 let mut record = ByteRecord::new();
1326 self.read_byte_record_impl(&mut record)?;
1327 self.set_headers_impl(Err(record));
1328 }
1329 let headers = self.state.headers.as_ref().unwrap();
1330 match headers.string_record {
1331 Ok(ref record) => Ok(record),
1332 Err(ref err) => Err(Error::new(ErrorKind::Utf8 {
1333 pos: headers.byte_record.position().map(Clone::clone),
1334 err: err.clone(),
1335 })),
1336 }
1337 }
1338
1339 /// Returns a reference to the first row read by this parser as raw bytes.
1340 ///
1341 /// If no row has been read yet, then this will force parsing of the first
1342 /// row.
1343 ///
1344 /// If there was a problem parsing the row then this returns an error.
1345 ///
1346 /// If the underlying reader emits EOF before any data, then this returns
1347 /// an empty record.
1348 ///
1349 /// Note that this method may be used regardless of whether `has_headers`
1350 /// was enabled (but it is enabled by default).
1351 ///
1352 /// # Example
1353 ///
1354 /// This example shows how to get the header row of CSV data. Notice that
1355 /// the header row does not appear as a record in the iterator!
1356 ///
1357 /// ```
1358 /// use std::error::Error;
1359 /// use csv::Reader;
1360 ///
1361 /// # fn main() { example().unwrap(); }
1362 /// fn example() -> Result<(), Box<dyn Error>> {
1363 /// let data = "\
1364 /// city,country,pop
1365 /// Boston,United States,4628910
1366 /// ";
1367 /// let mut rdr = Reader::from_reader(data.as_bytes());
1368 ///
1369 /// // We can read the headers before iterating.
1370 /// {
1371 /// // `headers` borrows from the reader, so we put this in its
1372 /// // own scope. That way, the borrow ends before we try iterating
1373 /// // below. Alternatively, we could clone the headers.
1374 /// let headers = rdr.byte_headers()?;
1375 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1376 /// }
1377 ///
1378 /// if let Some(result) = rdr.byte_records().next() {
1379 /// let record = result?;
1380 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1381 /// } else {
1382 /// return Err(From::from(
1383 /// "expected at least one record but got none"))
1384 /// }
1385 ///
1386 /// // We can also read the headers after iterating.
1387 /// let headers = rdr.byte_headers()?;
1388 /// assert_eq!(headers, vec!["city", "country", "pop"]);
1389 /// Ok(())
1390 /// }
1391 /// ```
1392 pub fn byte_headers(&mut self) -> Result<&ByteRecord> {
1393 if self.state.headers.is_none() {
1394 let mut record = ByteRecord::new();
1395 self.read_byte_record_impl(&mut record)?;
1396 self.set_headers_impl(Err(record));
1397 }
1398 Ok(&self.state.headers.as_ref().unwrap().byte_record)
1399 }
1400
1401 /// Set the headers of this CSV parser manually.
1402 ///
1403 /// This overrides any other setting (including `set_byte_headers`). Any
1404 /// automatic detection of headers is disabled. This may be called at any
1405 /// time.
1406 ///
1407 /// # Example
1408 ///
1409 /// ```
1410 /// use std::error::Error;
1411 /// use csv::{Reader, StringRecord};
1412 ///
1413 /// # fn main() { example().unwrap(); }
1414 /// fn example() -> Result<(), Box<dyn Error>> {
1415 /// let data = "\
1416 /// city,country,pop
1417 /// Boston,United States,4628910
1418 /// ";
1419 /// let mut rdr = Reader::from_reader(data.as_bytes());
1420 ///
1421 /// assert_eq!(rdr.headers()?, vec!["city", "country", "pop"]);
1422 /// rdr.set_headers(StringRecord::from(vec!["a", "b", "c"]));
1423 /// assert_eq!(rdr.headers()?, vec!["a", "b", "c"]);
1424 ///
1425 /// Ok(())
1426 /// }
1427 /// ```
1428 pub fn set_headers(&mut self, headers: StringRecord) {
1429 self.set_headers_impl(Ok(headers));
1430 }
1431
1432 /// Set the headers of this CSV parser manually as raw bytes.
1433 ///
1434 /// This overrides any other setting (including `set_headers`). Any
1435 /// automatic detection of headers is disabled. This may be called at any
1436 /// time.
1437 ///
1438 /// # Example
1439 ///
1440 /// ```
1441 /// use std::error::Error;
1442 /// use csv::{Reader, ByteRecord};
1443 ///
1444 /// # fn main() { example().unwrap(); }
1445 /// fn example() -> Result<(), Box<dyn Error>> {
1446 /// let data = "\
1447 /// city,country,pop
1448 /// Boston,United States,4628910
1449 /// ";
1450 /// let mut rdr = Reader::from_reader(data.as_bytes());
1451 ///
1452 /// assert_eq!(rdr.byte_headers()?, vec!["city", "country", "pop"]);
1453 /// rdr.set_byte_headers(ByteRecord::from(vec!["a", "b", "c"]));
1454 /// assert_eq!(rdr.byte_headers()?, vec!["a", "b", "c"]);
1455 ///
1456 /// Ok(())
1457 /// }
1458 /// ```
1459 pub fn set_byte_headers(&mut self, headers: ByteRecord) {
1460 self.set_headers_impl(Err(headers));
1461 }
1462
1463 fn set_headers_impl(
1464 &mut self,
1465 headers: result::Result<StringRecord, ByteRecord>,
1466 ) {
1467 // If we have string headers, then get byte headers. But if we have
1468 // byte headers, then get the string headers (or a UTF-8 error).
1469 let (mut str_headers, mut byte_headers) = match headers {
1470 Ok(string) => {
1471 let bytes = string.clone().into_byte_record();
1472 (Ok(string), bytes)
1473 }
1474 Err(bytes) => {
1475 match StringRecord::from_byte_record(bytes.clone()) {
1476 Ok(str_headers) => (Ok(str_headers), bytes),
1477 Err(err) => (Err(err.utf8_error().clone()), bytes),
1478 }
1479 }
1480 };
1481 if self.state.trim.should_trim_headers() {
1482 if let Ok(ref mut str_headers) = str_headers.as_mut() {
1483 str_headers.trim();
1484 }
1485 byte_headers.trim();
1486 }
1487 self.state.headers = Some(Headers {
1488 byte_record: byte_headers,
1489 string_record: str_headers,
1490 });
1491 }
1492
1493 /// Read a single row into the given record. Returns false when no more
1494 /// records could be read.
1495 ///
1496 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1497 /// default), then this will never read the first record.
1498 ///
1499 /// This method is useful when you want to read records as fast as
1500 /// possible. It's less ergonomic than an iterator, but it permits the
1501 /// caller to reuse the `StringRecord` allocation, which usually results
1502 /// in higher throughput.
1503 ///
1504 /// Records read via this method are guaranteed to have a position set
1505 /// on them, even if the reader is at EOF or if an error is returned.
1506 ///
1507 /// # Example
1508 ///
1509 /// ```
1510 /// use std::error::Error;
1511 /// use csv::{Reader, StringRecord};
1512 ///
1513 /// # fn main() { example().unwrap(); }
1514 /// fn example() -> Result<(), Box<dyn Error>> {
1515 /// let data = "\
1516 /// city,country,pop
1517 /// Boston,United States,4628910
1518 /// ";
1519 /// let mut rdr = Reader::from_reader(data.as_bytes());
1520 /// let mut record = StringRecord::new();
1521 ///
1522 /// if rdr.read_record(&mut record)? {
1523 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1524 /// Ok(())
1525 /// } else {
1526 /// Err(From::from("expected at least one record but got none"))
1527 /// }
1528 /// }
1529 /// ```
1530 pub fn read_record(&mut self, record: &mut StringRecord) -> Result<bool> {
1531 let result = record.read(self);
1532 // We need to trim again because trimming string records includes
1533 // Unicode whitespace. (ByteRecord trimming only includes ASCII
1534 // whitespace.)
1535 if self.state.trim.should_trim_fields() {
1536 record.trim();
1537 }
1538 result
1539 }
1540
1541 /// Read a single row into the given byte record. Returns false when no
1542 /// more records could be read.
1543 ///
1544 /// If `has_headers` was enabled via a `ReaderBuilder` (which is the
1545 /// default), then this will never read the first record.
1546 ///
1547 /// This method is useful when you want to read records as fast as
1548 /// possible. It's less ergonomic than an iterator, but it permits the
1549 /// caller to reuse the `ByteRecord` allocation, which usually results
1550 /// in higher throughput.
1551 ///
1552 /// Records read via this method are guaranteed to have a position set
1553 /// on them, even if the reader is at EOF or if an error is returned.
1554 ///
1555 /// # Example
1556 ///
1557 /// ```
1558 /// use std::error::Error;
1559 /// use csv::{ByteRecord, Reader};
1560 ///
1561 /// # fn main() { example().unwrap(); }
1562 /// fn example() -> Result<(), Box<dyn Error>> {
1563 /// let data = "\
1564 /// city,country,pop
1565 /// Boston,United States,4628910
1566 /// ";
1567 /// let mut rdr = Reader::from_reader(data.as_bytes());
1568 /// let mut record = ByteRecord::new();
1569 ///
1570 /// if rdr.read_byte_record(&mut record)? {
1571 /// assert_eq!(record, vec!["Boston", "United States", "4628910"]);
1572 /// Ok(())
1573 /// } else {
1574 /// Err(From::from("expected at least one record but got none"))
1575 /// }
1576 /// }
1577 /// ```
1578 pub fn read_byte_record(
1579 &mut self,
1580 record: &mut ByteRecord,
1581 ) -> Result<bool> {
1582 if !self.state.seeked && !self.state.has_headers && !self.state.first {
1583 // If the caller indicated "no headers" and we haven't yielded the
1584 // first record yet, then we should yield our header row if we have
1585 // one.
1586 if let Some(ref headers) = self.state.headers {
1587 self.state.first = true;
1588 record.clone_from(&headers.byte_record);
1589 if self.state.trim.should_trim_fields() {
1590 record.trim();
1591 }
1592 return Ok(!record.is_empty());
1593 }
1594 }
1595 let ok = self.read_byte_record_impl(record)?;
1596 self.state.first = true;
1597 if !self.state.seeked && self.state.headers.is_none() {
1598 self.set_headers_impl(Err(record.clone()));
1599 // If the end user indicated that we have headers, then we should
1600 // never return the first row. Instead, we should attempt to
1601 // read and return the next one.
1602 if self.state.has_headers {
1603 let result = self.read_byte_record_impl(record);
1604 if self.state.trim.should_trim_fields() {
1605 record.trim();
1606 }
1607 return result;
1608 }
1609 } else if self.state.trim.should_trim_fields() {
1610 record.trim();
1611 }
1612 Ok(ok)
1613 }
1614
1615 /// Read a byte record from the underlying CSV reader, without accounting
1616 /// for headers.
1617 #[inline(always)]
1618 fn read_byte_record_impl(
1619 &mut self,
1620 record: &mut ByteRecord,
1621 ) -> Result<bool> {
1622 use csv_core::ReadRecordResult::*;
1623
1624 record.clear();
1625 record.set_position(Some(self.state.cur_pos.clone()));
1626 if self.state.eof != ReaderEofState::NotEof {
1627 return Ok(false);
1628 }
1629 let (mut outlen, mut endlen) = (0, 0);
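        // Drive csv-core's streaming parser: feed it whatever bytes the
        // BufReader currently has buffered, copy any parsed field data and
        // field-end offsets into `record`, and keep looping (growing the
        // record's buffers as needed) until csv-core reports either a
        // complete record or the end of the data.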
1630 loop {
1631 let (res, nin, nout, nend) = {
1632 let input_res = self.rdr.fill_buf();
1633 if input_res.is_err() {
1634 self.state.eof = ReaderEofState::IOError;
1635 }
1636 let input = input_res?;
1637 let (fields, ends) = record.as_parts();
1638 self.core.read_record(
1639 input,
1640 &mut fields[outlen..],
1641 &mut ends[endlen..],
1642 )
1643 };
1644 self.rdr.consume(nin);
1645 let byte = self.state.cur_pos.byte();
1646 self.state
1647 .cur_pos
1648 .set_byte(byte + nin as u64)
1649 .set_line(self.core.line());
1650 outlen += nout;
1651 endlen += nend;
1652 match res {
1653 InputEmpty => continue,
1654 OutputFull => {
1655 record.expand_fields();
1656 continue;
1657 }
1658 OutputEndsFull => {
1659 record.expand_ends();
1660 continue;
1661 }
1662 Record => {
1663 record.set_len(endlen);
1664 self.state.add_record(record)?;
1665 return Ok(true);
1666 }
1667 End => {
1668 self.state.eof = ReaderEofState::Eof;
1669 return Ok(false);
1670 }
1671 }
1672 }
1673 }
1674
1675 /// Return the current position of this CSV reader.
1676 ///
1677 /// The byte offset in the position returned can be used to `seek` this
1678 /// reader. In particular, seeking to a position returned here on the same
1679 /// data will result in parsing the same subsequent record.
1680 ///
1681 /// # Example: reading the position
1682 ///
1683 /// ```
1684 /// use std::{error::Error, io};
1685 /// use csv::{Reader, Position};
1686 ///
1687 /// # fn main() { example().unwrap(); }
1688 /// fn example() -> Result<(), Box<dyn Error>> {
1689 /// let data = "\
1690 /// city,country,popcount
1691 /// Boston,United States,4628910
1692 /// Concord,United States,42695
1693 /// ";
1694 /// let rdr = Reader::from_reader(io::Cursor::new(data));
1695 /// let mut iter = rdr.into_records();
1696 /// let mut pos = Position::new();
1697 /// loop {
1698 /// // Read the position immediately before each record.
1699 /// let next_pos = iter.reader().position().clone();
1700 /// if iter.next().is_none() {
1701 /// break;
1702 /// }
1703 /// pos = next_pos;
1704 /// }
1705 ///
1706 /// // `pos` should now be the position immediately before the last
1707 /// // record.
1708 /// assert_eq!(pos.byte(), 51);
1709 /// assert_eq!(pos.line(), 3);
1710 /// assert_eq!(pos.record(), 2);
1711 /// Ok(())
1712 /// }
1713 /// ```
1714 pub fn position(&self) -> &Position {
1715 &self.state.cur_pos
1716 }
1717
1718 /// Returns true if and only if this reader has been exhausted.
1719 ///
1720 /// When this returns true, no more records can be read from this reader
1721 /// (unless it has been seeked to another position).
1722 ///
1723 /// # Example
1724 ///
1725 /// ```
1726 /// use std::{error::Error, io};
1727 /// use csv::{Reader, Position};
1728 ///
1729 /// # fn main() { example().unwrap(); }
1730 /// fn example() -> Result<(), Box<dyn Error>> {
1731 /// let data = "\
1732 /// city,country,popcount
1733 /// Boston,United States,4628910
1734 /// Concord,United States,42695
1735 /// ";
1736 /// let mut rdr = Reader::from_reader(io::Cursor::new(data));
1737 /// assert!(!rdr.is_done());
1738 /// for result in rdr.records() {
1739 /// let _ = result?;
1740 /// }
1741 /// assert!(rdr.is_done());
1742 /// Ok(())
1743 /// }
1744 /// ```
1745 pub fn is_done(&self) -> bool {
1746 self.state.eof != ReaderEofState::NotEof
1747 }
1748
1749 /// Returns true if and only if this reader has been configured to
1750 /// interpret the first record as a header record.
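    ///
    /// # Example
    ///
    /// A brief illustrative sketch; the flag is set on `ReaderBuilder` and is
    /// merely reported back here:
    ///
    /// ```
    /// use csv::ReaderBuilder;
    ///
    /// let with_headers = ReaderBuilder::new().from_reader("a,b\n1,2".as_bytes());
    /// assert!(with_headers.has_headers());
    ///
    /// let no_headers = ReaderBuilder::new()
    ///     .has_headers(false)
    ///     .from_reader("a,b\n1,2".as_bytes());
    /// assert!(!no_headers.has_headers());
    /// ```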
1751 pub fn has_headers(&self) -> bool {
1752 self.state.has_headers
1753 }
1754
1755 /// Returns a reference to the underlying reader.
1756 pub fn get_ref(&self) -> &R {
1757 self.rdr.get_ref()
1758 }
1759
1760 /// Returns a mutable reference to the underlying reader.
1761 pub fn get_mut(&mut self) -> &mut R {
1762 self.rdr.get_mut()
1763 }
1764
1765 /// Unwraps this CSV reader, returning the underlying reader.
1766 ///
1767 /// Note that any leftover data inside this reader's internal buffer is
1768 /// lost.
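    ///
    /// # Example
    ///
    /// A minimal sketch over an in-memory cursor (illustrative only):
    ///
    /// ```
    /// use std::io;
    /// use csv::Reader;
    ///
    /// let rdr = Reader::from_reader(io::Cursor::new("a,b\n1,2"));
    /// let cursor = rdr.into_inner();
    /// assert_eq!(cursor.get_ref(), &"a,b\n1,2");
    /// ```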
1769 pub fn into_inner(self) -> R {
1770 self.rdr.into_inner()
1771 }
1772}
1773
1774impl<R: io::Read + io::Seek> Reader<R> {
1775 /// Seeks the underlying reader to the position given.
1776 ///
1777 /// This comes with a few caveats:
1778 ///
1779 /// * Any internal buffer associated with this reader is cleared.
1780 /// * If the given position does not correspond to a position immediately
1781 /// before the start of a record, then the behavior of this reader is
1782 /// unspecified.
1783 /// * Any special logic that skips the first record in the CSV reader
1784 /// when reading or iterating over records is disabled.
1785 ///
1786 /// If the given position has a byte offset equivalent to the current
1787 /// position, then no seeking is performed.
1788 ///
1789 /// If the header row has not already been read, then this will attempt
1790 /// to read the header row before seeking. Therefore, it is possible that
1791 /// this returns an error associated with reading CSV data.
1792 ///
1793 /// Note that seeking is performed based only on the byte offset in the
1794 /// given position. Namely, the record or line numbers in the position may
1795 /// be incorrect; if they are, any future position generated by this CSV
1796 /// reader will be similarly incorrect.
1797 ///
1798 /// # Example: seek to parse a record twice
1799 ///
1800 /// ```
1801 /// use std::{error::Error, io};
1802 /// use csv::{Reader, Position};
1803 ///
1804 /// # fn main() { example().unwrap(); }
1805 /// fn example() -> Result<(), Box<dyn Error>> {
1806 /// let data = "\
1807 /// city,country,popcount
1808 /// Boston,United States,4628910
1809 /// Concord,United States,42695
1810 /// ";
1811 /// let rdr = Reader::from_reader(io::Cursor::new(data));
1812 /// let mut iter = rdr.into_records();
1813 /// let mut pos = Position::new();
1814 /// loop {
1815 /// // Read the position immediately before each record.
1816 /// let next_pos = iter.reader().position().clone();
1817 /// if iter.next().is_none() {
1818 /// break;
1819 /// }
1820 /// pos = next_pos;
1821 /// }
1822 ///
1823 /// // Now seek the reader back to `pos`. This will let us read the
1824 /// // last record again.
1825 /// iter.reader_mut().seek(pos)?;
1826 /// let mut iter = iter.into_reader().into_records();
1827 /// if let Some(result) = iter.next() {
1828 /// let record = result?;
1829 /// assert_eq!(record, vec!["Concord", "United States", "42695"]);
1830 /// Ok(())
1831 /// } else {
1832 /// Err(From::from("expected at least one record but got none"))
1833 /// }
1834 /// }
1835 /// ```
1836 pub fn seek(&mut self, pos: Position) -> Result<()> {
1837 self.byte_headers()?;
1838 self.state.seeked = true;
1839 if pos.byte() == self.state.cur_pos.byte() {
1840 return Ok(());
1841 }
1842 self.rdr.seek(io::SeekFrom::Start(pos.byte()))?;
1843 self.core.reset();
1844 self.core.set_line(pos.line());
1845 self.state.cur_pos = pos;
1846 self.state.eof = ReaderEofState::NotEof;
1847 Ok(())
1848 }
1849
1850 /// This is like `seek`, but provides direct control over how the seeking
1851 /// operation is performed via `io::SeekFrom`.
1852 ///
1853 /// The `pos` position given *should* correspond to the position indicated
1854 /// by `seek_from`, but this is not required. If the `pos` position
1855 /// given is incorrect, then the position information returned by this
1856 /// reader will be similarly incorrect.
1857 ///
1858 /// If the header row has not already been read, then this will attempt
1859 /// to read the header row before seeking. Therefore, it is possible that
1860 /// this returns an error associated with reading CSV data.
1861 ///
1862 /// Unlike `seek`, this will always cause an actual seek to be performed.
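    ///
    /// # Example
    ///
    /// A hedged sketch mirroring the `seek` example above; the byte offset,
    /// line and record numbers below assume that exact sample data.
    ///
    /// ```
    /// use std::{error::Error, io};
    /// use csv::{Position, Reader};
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> Result<(), Box<dyn Error>> {
    ///     let data = "\
    /// city,country,popcount
    /// Boston,United States,4628910
    /// Concord,United States,42695
    /// ";
    ///     let mut rdr = Reader::from_reader(io::Cursor::new(data));
    ///
    ///     // In this data, the last record starts at byte 51 (line 3,
    ///     // record 2).
    ///     let mut pos = Position::new();
    ///     pos.set_byte(51).set_line(3).set_record(2);
    ///     rdr.seek_raw(io::SeekFrom::Start(pos.byte()), pos)?;
    ///
    ///     let mut iter = rdr.into_records();
    ///     if let Some(result) = iter.next() {
    ///         let record = result?;
    ///         assert_eq!(record, vec!["Concord", "United States", "42695"]);
    ///         Ok(())
    ///     } else {
    ///         Err(From::from("expected at least one record but got none"))
    ///     }
    /// }
    /// ```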
1863 pub fn seek_raw(
1864 &mut self,
1865 seek_from: io::SeekFrom,
1866 pos: Position,
1867 ) -> Result<()> {
1868 self.byte_headers()?;
1869 self.state.seeked = true;
1870 self.rdr.seek(seek_from)?;
1871 self.core.reset();
1872 self.core.set_line(pos.line());
1873 self.state.cur_pos = pos;
1874 self.state.eof = ReaderEofState::NotEof;
1875 Ok(())
1876 }
1877}
1878
1879impl ReaderState {
1880 #[inline(always)]
1881 fn add_record(&mut self, record: &ByteRecord) -> Result<()> {
1882 let i = self.cur_pos.record();
1883 self.cur_pos.set_record(i.checked_add(1).unwrap());
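        // Unless the reader was configured as `flexible`, remember the field
        // count of the first record and reject any later record whose length
        // differs with an `UnequalLengths` error.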
1884 if !self.flexible {
1885 match self.first_field_count {
1886 None => self.first_field_count = Some(record.len() as u64),
1887 Some(expected) => {
1888 if record.len() as u64 != expected {
1889 return Err(Error::new(ErrorKind::UnequalLengths {
1890 pos: record.position().map(Clone::clone),
1891 expected_len: expected,
1892 len: record.len() as u64,
1893 }));
1894 }
1895 }
1896 }
1897 }
1898 Ok(())
1899 }
1900}
1901
1902/// An owned iterator over deserialized records.
1903///
1904/// The type parameter `R` refers to the underlying `io::Read` type, and `D`
1905/// refers to the type that this iterator will deserialize a record into.
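///
/// A value of this type is typically created by calling
/// `Reader::into_deserialize`.
///
/// # Example
///
/// A small illustrative sketch that deserializes each record into a tuple
/// (any `DeserializeOwned` type would do):
///
/// ```
/// use std::error::Error;
/// use csv::Reader;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<dyn Error>> {
///     let data = "city,pop\nBoston,4628910\nConcord,42695\n";
///     let rdr = Reader::from_reader(data.as_bytes());
///     let mut total = 0u64;
///     for result in rdr.into_deserialize::<(String, u64)>() {
///         let (_city, pop) = result?;
///         total += pop;
///     }
///     assert_eq!(total, 4671605);
///     Ok(())
/// }
/// ```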
1906pub struct DeserializeRecordsIntoIter<R, D> {
1907 rdr: Reader<R>,
1908 rec: StringRecord,
1909 headers: Option<StringRecord>,
1910 _priv: PhantomData<D>,
1911}
1912
1913impl<R: io::Read, D: DeserializeOwned> DeserializeRecordsIntoIter<R, D> {
1914 fn new(mut rdr: Reader<R>) -> DeserializeRecordsIntoIter<R, D> {
1915 let headers = if !rdr.state.has_headers {
1916 None
1917 } else {
1918 rdr.headers().ok().map(Clone::clone)
1919 };
1920 DeserializeRecordsIntoIter {
1921 rdr,
1922 rec: StringRecord::new(),
1923 headers,
1924 _priv: PhantomData,
1925 }
1926 }
1927
1928 /// Return a reference to the underlying CSV reader.
1929 pub fn reader(&self) -> &Reader<R> {
1930 &self.rdr
1931 }
1932
1933 /// Return a mutable reference to the underlying CSV reader.
1934 pub fn reader_mut(&mut self) -> &mut Reader<R> {
1935 &mut self.rdr
1936 }
1937
1938 /// Drop this iterator and return the underlying CSV reader.
1939 pub fn into_reader(self) -> Reader<R> {
1940 self.rdr
1941 }
1942}
1943
1944impl<R: io::Read, D: DeserializeOwned> Iterator
1945 for DeserializeRecordsIntoIter<R, D>
1946{
1947 type Item = Result<D>;
1948
1949 fn next(&mut self) -> Option<Result<D>> {
1950 match self.rdr.read_record(&mut self.rec) {
1951 Err(err) => Some(Err(err)),
1952 Ok(false) => None,
1953 Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())),
1954 }
1955 }
1956}
1957
1958/// A borrowed iterator over deserialized records.
1959///
1960/// The lifetime parameter `'r` refers to the lifetime of the underlying
1961/// CSV `Reader`. The type parameter `R` refers to the underlying `io::Read`
1962/// type, and `D` refers to the type that this iterator will deserialize a
1963/// record into.
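///
/// A value of this type is typically created by calling `Reader::deserialize`.
///
/// # Example
///
/// A brief illustrative sketch that borrows the reader:
///
/// ```
/// use std::error::Error;
/// use csv::Reader;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<dyn Error>> {
///     let mut rdr = Reader::from_reader("name,age\nalice,33\n".as_bytes());
///     let mut iter = rdr.deserialize::<(String, u8)>();
///     assert_eq!(iter.next().transpose()?, Some(("alice".to_string(), 33)));
///     Ok(())
/// }
/// ```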
1964pub struct DeserializeRecordsIter<'r, R: 'r, D> {
1965 rdr: &'r mut Reader<R>,
1966 rec: StringRecord,
1967 headers: Option<StringRecord>,
1968 _priv: PhantomData<D>,
1969}
1970
1971impl<'r, R: io::Read, D: DeserializeOwned> DeserializeRecordsIter<'r, R, D> {
1972 fn new(rdr: &'r mut Reader<R>) -> DeserializeRecordsIter<'r, R, D> {
1973 let headers = if !rdr.state.has_headers {
1974 None
1975 } else {
1976 rdr.headers().ok().map(Clone::clone)
1977 };
1978 DeserializeRecordsIter {
1979 rdr,
1980 rec: StringRecord::new(),
1981 headers,
1982 _priv: PhantomData,
1983 }
1984 }
1985
1986 /// Return a reference to the underlying CSV reader.
1987 pub fn reader(&self) -> &Reader<R> {
1988 &self.rdr
1989 }
1990
1991 /// Return a mutable reference to the underlying CSV reader.
1992 pub fn reader_mut(&mut self) -> &mut Reader<R> {
1993 &mut self.rdr
1994 }
1995}
1996
1997impl<'r, R: io::Read, D: DeserializeOwned> Iterator
1998 for DeserializeRecordsIter<'r, R, D>
1999{
2000 type Item = Result<D>;
2001
2002 fn next(&mut self) -> Option<Result<D>> {
2003 match self.rdr.read_record(&mut self.rec) {
2004 Err(err) => Some(Err(err)),
2005 Ok(false) => None,
2006 Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())),
2007 }
2008 }
2009}
2010
2011/// An owned iterator over records as strings.
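///
/// A value of this type is typically created by calling
/// `Reader::into_records`. Because it owns the reader, it can be returned
/// from the scope that built the reader, as in this illustrative sketch
/// (the helper function is purely hypothetical):
///
/// ```
/// use csv::{Reader, StringRecordsIntoIter};
///
/// // Build a reader over in-memory data and hand back an owning iterator.
/// fn records() -> StringRecordsIntoIter<&'static [u8]> {
///     Reader::from_reader("a,b\n1,2\n".as_bytes()).into_records()
/// }
///
/// let first = records().next().unwrap().unwrap();
/// assert_eq!(first, vec!["1", "2"]);
/// ```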
2012pub struct StringRecordsIntoIter<R> {
2013 rdr: Reader<R>,
2014 rec: StringRecord,
2015}
2016
2017impl<R: io::Read> StringRecordsIntoIter<R> {
2018 fn new(rdr: Reader<R>) -> StringRecordsIntoIter<R> {
2019 StringRecordsIntoIter { rdr, rec: StringRecord::new() }
2020 }
2021
2022 /// Return a reference to the underlying CSV reader.
2023 pub fn reader(&self) -> &Reader<R> {
2024 &self.rdr
2025 }
2026
2027 /// Return a mutable reference to the underlying CSV reader.
2028 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2029 &mut self.rdr
2030 }
2031
2032 /// Drop this iterator and return the underlying CSV reader.
2033 pub fn into_reader(self) -> Reader<R> {
2034 self.rdr
2035 }
2036}
2037
2038impl<R: io::Read> Iterator for StringRecordsIntoIter<R> {
2039 type Item = Result<StringRecord>;
2040
2041 fn next(&mut self) -> Option<Result<StringRecord>> {
2042 match self.rdr.read_record(&mut self.rec) {
2043 Err(err) => Some(Err(err)),
2044 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2045 Ok(false) => None,
2046 }
2047 }
2048}
2049
2050/// A borrowed iterator over records as strings.
2051///
2052/// The lifetime parameter `'r` refers to the lifetime of the underlying
2053/// CSV `Reader`.
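///
/// A value of this type is typically created by calling `Reader::records`.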
2054pub struct StringRecordsIter<'r, R: 'r> {
2055 rdr: &'r mut Reader<R>,
2056 rec: StringRecord,
2057}
2058
2059impl<'r, R: io::Read> StringRecordsIter<'r, R> {
2060 fn new(rdr: &'r mut Reader<R>) -> StringRecordsIter<'r, R> {
2061 StringRecordsIter { rdr, rec: StringRecord::new() }
2062 }
2063
2064 /// Return a reference to the underlying CSV reader.
2065 pub fn reader(&self) -> &Reader<R> {
2066 &self.rdr
2067 }
2068
2069 /// Return a mutable reference to the underlying CSV reader.
2070 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2071 &mut self.rdr
2072 }
2073}
2074
2075impl<'r, R: io::Read> Iterator for StringRecordsIter<'r, R> {
2076 type Item = Result<StringRecord>;
2077
2078 fn next(&mut self) -> Option<Result<StringRecord>> {
2079 match self.rdr.read_record(&mut self.rec) {
2080 Err(err) => Some(Err(err)),
2081 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2082 Ok(false) => None,
2083 }
2084 }
2085}
2086
2087/// An owned iterator over records as raw bytes.
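///
/// A value of this type is typically created by calling
/// `Reader::into_byte_records`.
///
/// # Example
///
/// A hedged sketch showing why byte records are handy when a field is not
/// valid UTF-8:
///
/// ```
/// use csv::ReaderBuilder;
///
/// // The second field is not valid UTF-8, but byte records read it fine.
/// let data: &[u8] = b"foo,b\xFFar\n";
/// let mut iter = ReaderBuilder::new()
///     .has_headers(false)
///     .from_reader(data)
///     .into_byte_records();
/// let rec = iter.next().unwrap().unwrap();
/// assert_eq!(b"b\xFFar", &rec[1]);
/// ```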
2088pub struct ByteRecordsIntoIter<R> {
2089 rdr: Reader<R>,
2090 rec: ByteRecord,
2091}
2092
2093impl<R: io::Read> ByteRecordsIntoIter<R> {
2094 fn new(rdr: Reader<R>) -> ByteRecordsIntoIter<R> {
2095 ByteRecordsIntoIter { rdr, rec: ByteRecord::new() }
2096 }
2097
2098 /// Return a reference to the underlying CSV reader.
2099 pub fn reader(&self) -> &Reader<R> {
2100 &self.rdr
2101 }
2102
2103 /// Return a mutable reference to the underlying CSV reader.
2104 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2105 &mut self.rdr
2106 }
2107
2108 /// Drop this iterator and return the underlying CSV reader.
2109 pub fn into_reader(self) -> Reader<R> {
2110 self.rdr
2111 }
2112}
2113
2114impl<R: io::Read> Iterator for ByteRecordsIntoIter<R> {
2115 type Item = Result<ByteRecord>;
2116
2117 fn next(&mut self) -> Option<Result<ByteRecord>> {
2118 match self.rdr.read_byte_record(&mut self.rec) {
2119 Err(err) => Some(Err(err)),
2120 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2121 Ok(false) => None,
2122 }
2123 }
2124}
2125
2126/// A borrowed iterator over records as raw bytes.
2127///
2128/// The lifetime parameter `'r` refers to the lifetime of the underlying
2129/// CSV `Reader`.
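///
/// A value of this type is typically created by calling `Reader::byte_records`.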
2130pub struct ByteRecordsIter<'r, R: 'r> {
2131 rdr: &'r mut Reader<R>,
2132 rec: ByteRecord,
2133}
2134
2135impl<'r, R: io::Read> ByteRecordsIter<'r, R> {
2136 fn new(rdr: &'r mut Reader<R>) -> ByteRecordsIter<'r, R> {
2137 ByteRecordsIter { rdr, rec: ByteRecord::new() }
2138 }
2139
2140 /// Return a reference to the underlying CSV reader.
2141 pub fn reader(&self) -> &Reader<R> {
2142 &self.rdr
2143 }
2144
2145 /// Return a mutable reference to the underlying CSV reader.
2146 pub fn reader_mut(&mut self) -> &mut Reader<R> {
2147 &mut self.rdr
2148 }
2149}
2150
2151impl<'r, R: io::Read> Iterator for ByteRecordsIter<'r, R> {
2152 type Item = Result<ByteRecord>;
2153
2154 fn next(&mut self) -> Option<Result<ByteRecord>> {
2155 match self.rdr.read_byte_record(&mut self.rec) {
2156 Err(err) => Some(Err(err)),
2157 Ok(true) => Some(Ok(self.rec.clone_truncated())),
2158 Ok(false) => None,
2159 }
2160 }
2161}
2162
2163#[cfg(test)]
2164mod tests {
2165 use std::io;
2166
2167 use crate::{
2168 byte_record::ByteRecord, error::ErrorKind, string_record::StringRecord,
2169 };
2170
2171 use super::{Position, ReaderBuilder, Trim};
2172
2173 fn b(s: &str) -> &[u8] {
2174 s.as_bytes()
2175 }
2176 fn s(b: &[u8]) -> &str {
2177 ::std::str::from_utf8(b).unwrap()
2178 }
2179
2180 fn newpos(byte: u64, line: u64, record: u64) -> Position {
2181 let mut p = Position::new();
2182 p.set_byte(byte).set_line(line).set_record(record);
2183 p
2184 }
2185
2186 #[test]
2187 fn read_byte_record() {
2188 let data = b("foo,\"b,ar\",baz\nabc,mno,xyz");
2189 let mut rdr =
2190 ReaderBuilder::new().has_headers(false).from_reader(data);
2191 let mut rec = ByteRecord::new();
2192
2193 assert!(rdr.read_byte_record(&mut rec).unwrap());
2194 assert_eq!(3, rec.len());
2195 assert_eq!("foo", s(&rec[0]));
2196 assert_eq!("b,ar", s(&rec[1]));
2197 assert_eq!("baz", s(&rec[2]));
2198
2199 assert!(rdr.read_byte_record(&mut rec).unwrap());
2200 assert_eq!(3, rec.len());
2201 assert_eq!("abc", s(&rec[0]));
2202 assert_eq!("mno", s(&rec[1]));
2203 assert_eq!("xyz", s(&rec[2]));
2204
2205 assert!(!rdr.read_byte_record(&mut rec).unwrap());
2206 }
2207
2208 #[test]
2209 fn read_trimmed_records_and_headers() {
2210 let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
2211 let mut rdr = ReaderBuilder::new()
2212 .has_headers(true)
2213 .trim(Trim::All)
2214 .from_reader(data);
2215 let mut rec = ByteRecord::new();
2216 assert!(rdr.read_byte_record(&mut rec).unwrap());
2217 assert_eq!("1", s(&rec[0]));
2218 assert_eq!("2", s(&rec[1]));
2219 assert_eq!("3", s(&rec[2]));
2220 let mut rec = StringRecord::new();
2221 assert!(rdr.read_record(&mut rec).unwrap());
2222 assert_eq!("1", &rec[0]);
2223 assert_eq!("", &rec[1]);
2224 assert_eq!("3", &rec[2]);
2225 {
2226 let headers = rdr.headers().unwrap();
2227 assert_eq!(3, headers.len());
2228 assert_eq!("foo", &headers[0]);
2229 assert_eq!("bar", &headers[1]);
2230 assert_eq!("baz", &headers[2]);
2231 }
2232 }
2233
2234 #[test]
2235 fn read_trimmed_header() {
2236 let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
2237 let mut rdr = ReaderBuilder::new()
2238 .has_headers(true)
2239 .trim(Trim::Headers)
2240 .from_reader(data);
2241 let mut rec = ByteRecord::new();
2242 assert!(rdr.read_byte_record(&mut rec).unwrap());
2243 assert_eq!(" 1", s(&rec[0]));
2244 assert_eq!(" 2", s(&rec[1]));
2245 assert_eq!(" 3", s(&rec[2]));
2246 {
2247 let headers = rdr.headers().unwrap();
2248 assert_eq!(3, headers.len());
2249 assert_eq!("foo", &headers[0]);
2250 assert_eq!("bar", &headers[1]);
2251 assert_eq!("baz", &headers[2]);
2252 }
2253 }
2254
2255 #[test]
2256 fn read_trimmed_header_invalid_utf8() {
2257 let data = &b"foo, b\xFFar,\tbaz\na,b,c\nd,e,f"[..];
2258 let mut rdr = ReaderBuilder::new()
2259 .has_headers(true)
2260 .trim(Trim::Headers)
2261 .from_reader(data);
2262 let mut rec = StringRecord::new();
2263
2264 // Force the headers to be read.
2265 let _ = rdr.read_record(&mut rec);
2266 // Check that the byte headers are trimmed.
2267 {
2268 let headers = rdr.byte_headers().unwrap();
2269 assert_eq!(3, headers.len());
2270 assert_eq!(b"foo", &headers[0]);
2271 assert_eq!(b"b\xFFar", &headers[1]);
2272 assert_eq!(b"baz", &headers[2]);
2273 }
2274 match *rdr.headers().unwrap_err().kind() {
2275 ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
2276 assert_eq!(pos, &newpos(0, 1, 0));
2277 assert_eq!(err.field(), 1);
2278 assert_eq!(err.valid_up_to(), 3);
2279 }
2280 ref err => panic!("match failed, got {:?}", err),
2281 }
2282 }
2283
2284 #[test]
2285 fn read_trimmed_records() {
2286 let data = b("foo, bar,\tbaz\n 1, 2, 3\n1\t,\t,3\t\t");
2287 let mut rdr = ReaderBuilder::new()
2288 .has_headers(true)
2289 .trim(Trim::Fields)
2290 .from_reader(data);
2291 let mut rec = ByteRecord::new();
2292 assert!(rdr.read_byte_record(&mut rec).unwrap());
2293 assert_eq!("1", s(&rec[0]));
2294 assert_eq!("2", s(&rec[1]));
2295 assert_eq!("3", s(&rec[2]));
2296 {
2297 let headers = rdr.headers().unwrap();
2298 assert_eq!(3, headers.len());
2299 assert_eq!("foo", &headers[0]);
2300 assert_eq!(" bar", &headers[1]);
2301 assert_eq!("\tbaz", &headers[2]);
2302 }
2303 }
2304
2305 #[test]
2306 fn read_record_unequal_fails() {
2307 let data = b("foo\nbar,baz");
2308 let mut rdr =
2309 ReaderBuilder::new().has_headers(false).from_reader(data);
2310 let mut rec = ByteRecord::new();
2311
2312 assert!(rdr.read_byte_record(&mut rec).unwrap());
2313 assert_eq!(1, rec.len());
2314 assert_eq!("foo", s(&rec[0]));
2315
2316 match rdr.read_byte_record(&mut rec) {
2317 Err(err) => match *err.kind() {
2318 ErrorKind::UnequalLengths {
2319 expected_len: 1,
2320 ref pos,
2321 len: 2,
2322 } => {
2323 assert_eq!(pos, &Some(newpos(4, 2, 1)));
2324 }
2325 ref wrong => panic!("match failed, got {:?}", wrong),
2326 },
2327 wrong => panic!("match failed, got {:?}", wrong),
2328 }
2329 }
2330
2331 #[test]
2332 fn read_record_unequal_ok() {
2333 let data = b("foo\nbar,baz");
2334 let mut rdr = ReaderBuilder::new()
2335 .has_headers(false)
2336 .flexible(true)
2337 .from_reader(data);
2338 let mut rec = ByteRecord::new();
2339
2340 assert!(rdr.read_byte_record(&mut rec).unwrap());
2341 assert_eq!(1, rec.len());
2342 assert_eq!("foo", s(&rec[0]));
2343
2344 assert!(rdr.read_byte_record(&mut rec).unwrap());
2345 assert_eq!(2, rec.len());
2346 assert_eq!("bar", s(&rec[0]));
2347 assert_eq!("baz", s(&rec[1]));
2348
2349 assert!(!rdr.read_byte_record(&mut rec).unwrap());
2350 }
2351
2352 // This tests that even if we get a CSV error, we can continue reading
2353 // if we want.
2354 #[test]
2355 fn read_record_unequal_continue() {
2356 let data = b("foo\nbar,baz\nquux");
2357 let mut rdr =
2358 ReaderBuilder::new().has_headers(false).from_reader(data);
2359 let mut rec = ByteRecord::new();
2360
2361 assert!(rdr.read_byte_record(&mut rec).unwrap());
2362 assert_eq!(1, rec.len());
2363 assert_eq!("foo", s(&rec[0]));
2364
2365 match rdr.read_byte_record(&mut rec) {
2366 Err(err) => match err.kind() {
2367 &ErrorKind::UnequalLengths {
2368 expected_len: 1,
2369 ref pos,
2370 len: 2,
2371 } => {
2372 assert_eq!(pos, &Some(newpos(4, 2, 1)));
2373 }
2374 wrong => panic!("match failed, got {:?}", wrong),
2375 },
2376 wrong => panic!("match failed, got {:?}", wrong),
2377 }
2378
2379 assert!(rdr.read_byte_record(&mut rec).unwrap());
2380 assert_eq!(1, rec.len());
2381 assert_eq!("quux", s(&rec[0]));
2382
2383 assert!(!rdr.read_byte_record(&mut rec).unwrap());
2384 }
2385
2386 #[test]
2387 fn read_record_headers() {
2388 let data = b("foo,bar,baz\na,b,c\nd,e,f");
2389 let mut rdr = ReaderBuilder::new().has_headers(true).from_reader(data);
2390 let mut rec = StringRecord::new();
2391
2392 assert!(rdr.read_record(&mut rec).unwrap());
2393 assert_eq!(3, rec.len());
2394 assert_eq!("a", &rec[0]);
2395
2396 assert!(rdr.read_record(&mut rec).unwrap());
2397 assert_eq!(3, rec.len());
2398 assert_eq!("d", &rec[0]);
2399
2400 assert!(!rdr.read_record(&mut rec).unwrap());
2401
2402 {
2403 let headers = rdr.byte_headers().unwrap();
2404 assert_eq!(3, headers.len());
2405 assert_eq!(b"foo", &headers[0]);
2406 assert_eq!(b"bar", &headers[1]);
2407 assert_eq!(b"baz", &headers[2]);
2408 }
2409 {
2410 let headers = rdr.headers().unwrap();
2411 assert_eq!(3, headers.len());
2412 assert_eq!("foo", &headers[0]);
2413 assert_eq!("bar", &headers[1]);
2414 assert_eq!("baz", &headers[2]);
2415 }
2416 }
2417
2418 #[test]
2419 fn read_record_headers_invalid_utf8() {
2420 let data = &b"foo,b\xFFar,baz\na,b,c\nd,e,f"[..];
2421 let mut rdr = ReaderBuilder::new().has_headers(true).from_reader(data);
2422 let mut rec = StringRecord::new();
2423
2424 assert!(rdr.read_record(&mut rec).unwrap());
2425 assert_eq!(3, rec.len());
2426 assert_eq!("a", &rec[0]);
2427
2428 assert!(rdr.read_record(&mut rec).unwrap());
2429 assert_eq!(3, rec.len());
2430 assert_eq!("d", &rec[0]);
2431
2432 assert!(!rdr.read_record(&mut rec).unwrap());
2433
2434 // Check that we can read the headers as raw bytes, but that
2435 // if we read them as strings, we get an appropriate UTF-8 error.
2436 {
2437 let headers = rdr.byte_headers().unwrap();
2438 assert_eq!(3, headers.len());
2439 assert_eq!(b"foo", &headers[0]);
2440 assert_eq!(b"b\xFFar", &headers[1]);
2441 assert_eq!(b"baz", &headers[2]);
2442 }
2443 match *rdr.headers().unwrap_err().kind() {
2444 ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
2445 assert_eq!(pos, &newpos(0, 1, 0));
2446 assert_eq!(err.field(), 1);
2447 assert_eq!(err.valid_up_to(), 1);
2448 }
2449 ref err => panic!("match failed, got {:?}", err),
2450 }
2451 }
2452
2453 #[test]
2454 fn read_record_no_headers_before() {
2455 let data = b("foo,bar,baz\na,b,c\nd,e,f");
2456 let mut rdr =
2457 ReaderBuilder::new().has_headers(false).from_reader(data);
2458 let mut rec = StringRecord::new();
2459
2460 {
2461 let headers = rdr.headers().unwrap();
2462 assert_eq!(3, headers.len());
2463 assert_eq!("foo", &headers[0]);
2464 assert_eq!("bar", &headers[1]);
2465 assert_eq!("baz", &headers[2]);
2466 }
2467
2468 assert!(rdr.read_record(&mut rec).unwrap());
2469 assert_eq!(3, rec.len());
2470 assert_eq!("foo", &rec[0]);
2471
2472 assert!(rdr.read_record(&mut rec).unwrap());
2473 assert_eq!(3, rec.len());
2474 assert_eq!("a", &rec[0]);
2475
2476 assert!(rdr.read_record(&mut rec).unwrap());
2477 assert_eq!(3, rec.len());
2478 assert_eq!("d", &rec[0]);
2479
2480 assert!(!rdr.read_record(&mut rec).unwrap());
2481 }
2482
2483 #[test]
2484 fn read_record_no_headers_after() {
2485 let data = b("foo,bar,baz\na,b,c\nd,e,f");
2486 let mut rdr =
2487 ReaderBuilder::new().has_headers(false).from_reader(data);
2488 let mut rec = StringRecord::new();
2489
2490 assert!(rdr.read_record(&mut rec).unwrap());
2491 assert_eq!(3, rec.len());
2492 assert_eq!("foo", &rec[0]);
2493
2494 assert!(rdr.read_record(&mut rec).unwrap());
2495 assert_eq!(3, rec.len());
2496 assert_eq!("a", &rec[0]);
2497
2498 assert!(rdr.read_record(&mut rec).unwrap());
2499 assert_eq!(3, rec.len());
2500 assert_eq!("d", &rec[0]);
2501
2502 assert!(!rdr.read_record(&mut rec).unwrap());
2503
2504 let headers = rdr.headers().unwrap();
2505 assert_eq!(3, headers.len());
2506 assert_eq!("foo", &headers[0]);
2507 assert_eq!("bar", &headers[1]);
2508 assert_eq!("baz", &headers[2]);
2509 }
2510
2511 #[test]
2512 fn seek() {
2513 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2514 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2515 rdr.seek(newpos(18, 3, 2)).unwrap();
2516
2517 let mut rec = StringRecord::new();
2518
2519 assert_eq!(18, rdr.position().byte());
2520 assert!(rdr.read_record(&mut rec).unwrap());
2521 assert_eq!(3, rec.len());
2522 assert_eq!("d", &rec[0]);
2523
2524 assert_eq!(24, rdr.position().byte());
2525 assert_eq!(4, rdr.position().line());
2526 assert_eq!(3, rdr.position().record());
2527 assert!(rdr.read_record(&mut rec).unwrap());
2528 assert_eq!(3, rec.len());
2529 assert_eq!("g", &rec[0]);
2530
2531 assert!(!rdr.read_record(&mut rec).unwrap());
2532 }
2533
2534 // Test that we can read headers after seeking even if the headers weren't
2535 // explicitly read before seeking.
2536 #[test]
2537 fn seek_headers_after() {
2538 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2539 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2540 rdr.seek(newpos(18, 3, 2)).unwrap();
2541 assert_eq!(rdr.headers().unwrap(), vec!["foo", "bar", "baz"]);
2542 }
2543
2544 // Test that we can read headers after seeking if the headers were read
2545 // before seeking.
2546 #[test]
2547 fn seek_headers_before_after() {
2548 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2549 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2550 let headers = rdr.headers().unwrap().clone();
2551 rdr.seek(newpos(18, 3, 2)).unwrap();
2552 assert_eq!(&headers, rdr.headers().unwrap());
2553 }
2554
2555 // Test that even if we didn't read headers before seeking, if we seek to
2556 // the current byte offset, then no seeking is done and therefore we can
2557 // still read headers after seeking.
2558 #[test]
2559 fn seek_headers_no_actual_seek() {
2560 let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
2561 let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
2562 rdr.seek(Position::new()).unwrap();
2563 assert_eq!("foo", &rdr.headers().unwrap()[0]);
2564 }
2565
2566 // Test that position info is reported correctly in absence of headers.
2567 #[test]
2568 fn positions_no_headers() {
2569 let mut rdr = ReaderBuilder::new()
2570 .has_headers(false)
2571 .from_reader("a,b,c\nx,y,z".as_bytes())
2572 .into_records();
2573
2574 let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
2575 assert_eq!(pos.byte(), 0);
2576 assert_eq!(pos.line(), 1);
2577 assert_eq!(pos.record(), 0);
2578
2579 let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
2580 assert_eq!(pos.byte(), 6);
2581 assert_eq!(pos.line(), 2);
2582 assert_eq!(pos.record(), 1);
2583 }
2584
2585 // Test that position info is reported correctly with headers.
2586 #[test]
2587 fn positions_headers() {
2588 let mut rdr = ReaderBuilder::new()
2589 .has_headers(true)
2590 .from_reader("a,b,c\nx,y,z".as_bytes())
2591 .into_records();
2592
2593 let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
2594 assert_eq!(pos.byte(), 6);
2595 assert_eq!(pos.line(), 2);
2596 assert_eq!(pos.record(), 1);
2597 }
2598
2599 // Test that reading headers on empty data yields an empty record.
2600 #[test]
2601 fn headers_on_empty_data() {
2602 let mut rdr = ReaderBuilder::new().from_reader("".as_bytes());
2603 let r = rdr.byte_headers().unwrap();
2604 assert_eq!(r.len(), 0);
2605 }
2606
2607 // Test that reading the first record on empty data works.
2608 #[test]
2609 fn no_headers_on_empty_data() {
2610 let mut rdr =
2611 ReaderBuilder::new().has_headers(false).from_reader("".as_bytes());
2612 assert_eq!(rdr.records().count(), 0);
2613 }
2614
2615 // Test that reading the first record on empty data works, even if
2616 // we've tried to read headers beforehand.
2617 #[test]
2618 fn no_headers_on_empty_data_after_headers() {
2619 let mut rdr =
2620 ReaderBuilder::new().has_headers(false).from_reader("".as_bytes());
2621 assert_eq!(rdr.headers().unwrap().len(), 0);
2622 assert_eq!(rdr.records().count(), 0);
2623 }
2624}