Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Allow for escaping the delimiter in reader
Browse files Browse the repository at this point in the history
leaty committed May 31, 2021
1 parent 40ea4c4 commit 7102087
Showing 2 changed files with 80 additions and 10 deletions.
57 changes: 48 additions & 9 deletions csv-core/src/reader.rs
Original file line number Diff line number Diff line change
@@ -428,14 +428,15 @@ enum NfaState {
InQuotedField = 3,
InEscapedQuote = 4,
InDoubleEscapedQuote = 5,
InComment = 6,
InEscapeSequence = 6,
InComment = 7,
// All states below are "final field" states.
// Namely, they indicate that a field has been parsed.
EndFieldDelim = 7,
EndFieldDelim = 8,
// All states below are "final record" states.
// Namely, they indicate that a record has been parsed.
EndRecord = 8,
CRLF = 9,
EndRecord = 9,
CRLF = 10,
}

/// A list of NFA states that have an explicit representation in the DFA.
@@ -447,6 +448,7 @@ const NFA_STATES: &'static [NfaState] = &[
NfaState::InQuotedField,
NfaState::InEscapedQuote,
NfaState::InDoubleEscapedQuote,
NfaState::InEscapeSequence,
NfaState::InComment,
NfaState::EndRecord,
NfaState::CRLF,
@@ -805,9 +807,9 @@ impl Reader {
self.dfa.classes.add(self.delimiter);
if self.quoting {
self.dfa.classes.add(self.quote);
if let Some(escape) = self.escape {
self.dfa.classes.add(escape);
}
}
if let Some(escape) = self.escape {
self.dfa.classes.add(escape);
}
if let Some(comment) = self.comment {
self.dfa.classes.add(comment);
@@ -970,7 +972,7 @@ impl Reader {
match state {
End | StartRecord | EndRecord | InComment | CRLF => End,
StartField | EndFieldDelim | EndFieldTerm | InField
| InQuotedField | InEscapedQuote | InDoubleEscapedQuote
| InQuotedField | InEscapedQuote | InDoubleEscapedQuote | InEscapeSequence
| InRecordTerm => EndRecord,
}
}
@@ -1007,6 +1009,8 @@ impl Reader {
(EndFieldDelim, NfaInputAction::Discard)
} else if self.term.equals(c) {
(EndFieldTerm, NfaInputAction::Epsilon)
} else if !self.quoting && self.escape == Some(c) {
(InEscapeSequence, NfaInputAction::Discard)
} else {
(InField, NfaInputAction::CopyToOutput)
}
@@ -1018,6 +1022,8 @@ impl Reader {
(EndFieldDelim, NfaInputAction::Discard)
} else if self.term.equals(c) {
(EndFieldTerm, NfaInputAction::Epsilon)
} else if !self.quoting && self.escape == Some(c) {
(InEscapeSequence, NfaInputAction::Discard)
} else {
(InField, NfaInputAction::CopyToOutput)
}
@@ -1043,6 +1049,7 @@ impl Reader {
(InField, NfaInputAction::CopyToOutput)
}
}
InEscapeSequence => (InField, NfaInputAction::CopyToOutput),
InComment => {
if b'\n' == c {
(StartRecord, NfaInputAction::Discard)
@@ -1087,7 +1094,7 @@ impl Reader {
/// be reached by epsilon transitions will never have explicit usage in the
/// DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const DFA_STATES: usize = 11;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;

/// The number of possible transition classes. (See the comment on `TRANS_SIZE`
@@ -1119,6 +1126,8 @@ struct Dfa {
in_field: DfaState,
/// The DFA state corresponding to being inside an quoted field.
in_quoted: DfaState,
/// The DFA state corresponding to being in an escape sequence.
in_escape_sequence: DfaState,
/// The minimum DFA state that indicates a field has been parsed. All DFA
/// states greater than this are also final-field states.
final_field: DfaState,
@@ -1135,6 +1144,7 @@ impl Dfa {
classes: DfaClasses::new(),
in_field: DfaState(0),
in_quoted: DfaState(0),
in_escape_sequence: DfaState(0),
final_field: DfaState(0),
final_record: DfaState(0),
}
@@ -1170,6 +1180,7 @@ impl Dfa {
fn finish(&mut self) {
self.in_field = self.new_state(NfaState::InField);
self.in_quoted = self.new_state(NfaState::InQuotedField);
self.in_escape_sequence = self.new_state(NfaState::InEscapeSequence);
self.final_field = self.new_state(NfaState::EndFieldDelim);
self.final_record = self.new_state(NfaState::EndRecord);
}
@@ -1665,6 +1676,15 @@ mod tests {
}
);

parses_to!(
escape_sequence,
"a\\,b\\\\c,\\,fo\"o\\,,bar",
csv![["a,b\\c", ",fo\"o,", "bar"]],
|b: &mut ReaderBuilder| {
b.quoting(false).escape(Some(b'\\'));
}
);

parses_to!(
delimiter_tabs,
"a\tb",
@@ -1863,6 +1883,25 @@ mod tests {
assert_read!(rdr, &[], out, 0, 0, End);
}

// Test we can read escape sequences correctly in a stream.
#[test]
fn stream_escape_sequence() {
use crate::ReadFieldResult::*;

let out = &mut [0; 10];
let mut builder = ReaderBuilder::new();
let mut rdr = builder.quoting(false).escape(Some(b'\\')).build();

assert_read!(rdr, b("\\,f\\\\o\\"), out, 7, 4, InputEmpty);
assert_eq!(&out[..4], b(",f\\o"));

assert_read!(rdr, b(",o\\,"), &mut out[4..], 4, 3, InputEmpty);
assert_eq!(&out[..7], b(",f\\o,o,"));

assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
assert_read!(rdr, &[], out, 0, 0, End);
}

// Test that empty output buffers don't wreak havoc.
#[test]
fn stream_empty_output() {
33 changes: 32 additions & 1 deletion src/reader.rs
Original file line number Diff line number Diff line change
@@ -428,9 +428,12 @@ impl ReaderBuilder {
/// In some variants of CSV, quotes are escaped using a special escape
/// character like `\` (instead of escaping quotes by doubling them).
///
/// Other variants of CSV may use an escape character to escape delimiters instead
/// of using quoted fields, this is supported only when quoting is disabled.
///
/// By default, recognizing these idiosyncratic escapes is disabled.
///
/// # Example
/// # Example with escaped quotes
///
/// ```
/// use std::error::Error;
@@ -457,6 +460,34 @@ impl ReaderBuilder {
/// }
/// }
/// ```
///
/// # Example with escaped delimiters
///
/// ```
/// use std::error::Error;
/// use csv::ReaderBuilder;
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<dyn Error>> {
/// let data = "\
/// city,country,pop
/// Boston,The\\, United\\, States,4628910
/// ";
/// let mut rdr = ReaderBuilder::new()
/// .quoting(false)
/// .escape(Some(b'\\'))
/// .from_reader(data.as_bytes());
///
/// if let Some(result) = rdr.records().next() {
/// let record = result?;
/// assert_eq!(record, vec![
/// "Boston", "The, United, States", "4628910",
/// ]);
/// Ok(())
/// } else {
/// Err(From::from("expected at least one record but got none"))
/// }
/// }
/// ```
pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
self.builder.escape(escape);
self

0 comments on commit 7102087

Please sign in to comment.