Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -277,15 +277,22 @@ Token nextToken(final Token token) throws IOException {
}
// Important: make sure a new char gets consumed in each iteration
while (token.type == Token.Type.INVALID) {
// isDelimiter consumes the trailing characters of a multi-character delimiter as a side effect, so it must
// only be evaluated once per character. Remember a match found while skipping whitespace below.
boolean delimiter = false;
// ignore whitespaces at beginning of a token
if (ignoreSurroundingSpaces) {
while (Character.isWhitespace((char) c) && !isDelimiter(c) && !eol) {
while (Character.isWhitespace((char) c) && !eol) {
if (isDelimiter(c)) {
delimiter = true;
break;
}
c = reader.read();
eol = readEndOfLine(c);
}
}
// ok, start of token reached: encapsulated, or token
if (isDelimiter(c)) {
if (delimiter || isDelimiter(c)) {
// empty token return TOKEN("")
token.type = Token.Type.TOKEN;
} else if (eol) {
Expand Down
20 changes: 20 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1758,6 +1758,26 @@ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
}
}

/**
 * Checks that an empty field next to a multi-character delimiter is kept when the delimiter begins with a whitespace
 * character and {@code ignoreSurroundingSpaces} is enabled.
 * <p>
 * The delimiter look-ahead is consumed while leading whitespace is skipped, so evaluating it a second time would
 * lose the empty field and merge the value of the following field into it.
 * </p>
 */
@Test
void testEmptyFieldBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
    final CSVFormat spacePipeFormat = CSVFormat.DEFAULT.builder()
            .setDelimiter(" |")
            .setIgnoreSurroundingSpaces(true)
            .get();
    // Delimiter at the very start of the input: an empty first field followed by "a".
    try (CSVParser leadingDelimiterParser = CSVParser.parse(" |a", spacePipeFormat)) {
        final List<CSVRecord> parsed = leadingDelimiterParser.getRecords();
        assertEquals(1, parsed.size());
        final String[] expectedValues = { "", "a" };
        assertValuesEquals(expectedValues, parsed.get(0));
    }
    // Two delimiters back to back: an empty field sits between "a" and "b".
    try (CSVParser adjacentDelimitersParser = CSVParser.parse("a | |b", spacePipeFormat)) {
        final List<CSVRecord> parsed = adjacentDelimitersParser.getRecords();
        assertEquals(1, parsed.size());
        final String[] expectedValues = { "a", "", "b" };
        assertValuesEquals(expectedValues, parsed.get(0));
    }
}

@Test
void testProvidedHeader() throws Exception {
final Reader in = new StringReader("a,b,c\n1,2,3\nx,y,z");
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/org/apache/commons/csv/LexerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,25 @@ void testPartialMultiCharacterDelimiterAtEOFAfterMismatch() throws IOException {
}
}

/**
 * Checks the token sequence produced for a multi-character delimiter that begins with a whitespace character while
 * {@code ignoreSurroundingSpaces} is enabled.
 * <p>
 * {@link Lexer#isDelimiter(int)} has a side effect, so it must be evaluated only once per character; otherwise the
 * whitespace-skip loop consumes the delimiter and the empty token at the boundary is dropped.
 * </p>
 */
@Test
void testEmptyTokenBeforeWhitespacePrefixedMultiCharacterDelimiter() throws IOException {
    final CSVFormat spacePipeFormat = CSVFormat.DEFAULT.builder()
            .setDelimiter(" |")
            .setIgnoreSurroundingSpaces(true)
            .get();
    // Delimiter at the very start of the input: an empty token, then the final "a" reported with EOF.
    try (Lexer leadingDelimiterLexer = createLexer(" |a", spacePipeFormat)) {
        assertNextToken(TOKEN, "", leadingDelimiterLexer);
        assertNextToken(EOF, "a", leadingDelimiterLexer);
    }
    // Two delimiters back to back: an empty token sits between "a" and the final "b".
    try (Lexer adjacentDelimitersLexer = createLexer("a | |b", spacePipeFormat)) {
        assertNextToken(TOKEN, "a", adjacentDelimitersLexer);
        assertNextToken(TOKEN, "", adjacentDelimitersLexer);
        assertNextToken(EOF, "b", adjacentDelimitersLexer);
    }
}

@Test
void testReadEscapeBackspace() throws IOException {
try (Lexer lexer = createLexer("b", CSVFormat.DEFAULT.withEscape('\b'))) {
Expand Down
Loading