Author: yonik
Date: Sat Jan 5 07:37:26 2008
New Revision: 609155
URL: http://svn.apache.org/viewvc?rev=609155&view=rev
Log:
SANDBOX-206: add escape to strategy, turn off backslash-style escaping by
default
Modified:
commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java
commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVStrategy.java
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVStrategyTest.java
Modified:
commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java
URL:
http://svn.apache.org/viewvc/commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java?rev=609155&r1=609154&r2=609155&view=diff
==============================================================================
--- commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java
(original)
+++ commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVParser.java
Sat Jan 5 07:37:26 2008
@@ -134,7 +134,7 @@
* @deprecated use {@link #CSVParser(Reader,CSVStrategy)}.
*/
public CSVParser(Reader input, char delimiter) {
- this(input, delimiter, '"', (char) 0);
+ this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
}
/**
@@ -347,7 +347,7 @@
eol = isEndOfLine(c);
}
// ok, start of token reached: comment, encapsulated, or token
- if (!strategy.isCommentingDisabled() && c == strategy.getCommentStart()) {
+ if (c == strategy.getCommentStart()) {
// ignore everything till end of line and continue (incr linecount)
in.readLine();
tkn = nextToken(tkn.reset());
@@ -400,19 +400,22 @@
*/
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
wsBuf.clear();
- while (!tkn.isReady) {
+ for (;;) {
if (isEndOfLine(c)) {
// end of record
tkn.type = TT_EORECORD;
tkn.isReady = true;
+ return tkn;
} else if (isEndOfFile(c)) {
// end of file
tkn.type = TT_EOF;
tkn.isReady = true;
+ return tkn;
} else if (c == strategy.getDelimiter()) {
// end of token
tkn.type = TT_TOKEN;
tkn.isReady = true;
+ return tkn;
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() &&
in.lookAhead() == 'u') {
// interpret unicode escaped chars (like \u0070 -> p)
tkn.content.append((char) unicodeEscapeLexer(c));
@@ -422,6 +425,8 @@
if (tkn.content.length() > 0) {
wsBuf.append((char) c);
}
+ } else if (c == strategy.getEscape()) {
+ tkn.content.append((char)readEscape(c));
} else {
// prepend whitespaces (if we have)
if (wsBuf.length() > 0) {
@@ -435,7 +440,6 @@
c = in.read();
}
}
- return tkn;
}
@@ -457,70 +461,55 @@
int startLineNumber = getLineNumber();
// ignore the given delimiter
// assert c == delimiter;
- c = in.read();
- while (!tkn.isReady) {
- boolean skipRead = false;
- if (c == strategy.getEncapsulator() || c == '\\') {
- // check lookahead
+ for (;;) {
+ c = in.read();
+
+ if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
+ tkn.content.append((char) unicodeEscapeLexer(c));
+ } else if (c == strategy.getEscape()) {
+ tkn.content.append((char)readEscape(c));
+ } else if (c == strategy.getEncapsulator()) {
if (in.lookAhead() == strategy.getEncapsulator()) {
// double or escaped encapsulator -> add single encapsulator to token
c = in.read();
tkn.content.append((char) c);
- } else if (c == '\\' && in.lookAhead() == '\\') {
- // doubled escape char, it does not escape itself, only encapsulator
- // -> add both escape chars to stream
- tkn.content.append((char) c);
- c = in.read();
- tkn.content.append((char) c);
- } else if (
- strategy.getUnicodeEscapeInterpretation()
- && c == '\\'
- && in.lookAhead() == 'u') {
- // interpret unicode escaped chars (like \u0070 -> p)
- tkn.content.append((char) unicodeEscapeLexer(c));
- } else if (c == '\\') {
- // use a single escape character -> add it to stream
- tkn.content.append((char) c);
} else {
// token finish mark (encapsulator) reached: ignore whitespace till delimiter
- while (!tkn.isReady) {
+ for (;;) {
c = in.read();
if (c == strategy.getDelimiter()) {
tkn.type = TT_TOKEN;
tkn.isReady = true;
+ return tkn;
} else if (isEndOfFile(c)) {
tkn.type = TT_EOF;
tkn.isReady = true;
+ return tkn;
} else if (isEndOfLine(c)) {
// ok eo token reached
tkn.type = TT_EORECORD;
tkn.isReady = true;
+ return tkn;
} else if (!isWhitespace(c)) {
- // error invalid char between token and next delimiter
- throw new IOException(
- "(line " + getLineNumber()
- + ") invalid char between encapsulated token end delimiter"
- );
- }
+ // error invalid char between token and next delimiter
+ throw new IOException(
+ "(line " + getLineNumber()
+ + ") invalid char between encapsulated token end delimiter"
+ );
+ }
}
- skipRead = true;
}
} else if (isEndOfFile(c)) {
// error condition (end of file before end of token)
throw new IOException(
- "(startline " + startLineNumber + ")"
- + "eof reached before encapsulated token finished"
- );
+ "(startline " + startLineNumber + ")"
+ + "eof reached before encapsulated token finished"
+ );
} else {
// consume character
tkn.content.append((char) c);
}
- // get the next char
- if (!tkn.isReady && !skipRead) {
- c = in.read();
- }
}
- return tkn;
}
@@ -553,6 +542,21 @@
+ code.toString() + "'" + e.toString());
}
return ret;
+ }
+
+ private int readEscape(int c) throws IOException {
+ // assume c is the escape char (normally a backslash)
+ c = in.read();
+ int out;
+ switch (c) {
+ case 'r': out='\r'; break;
+ case 'n': out='\n'; break;
+ case 't': out='\t'; break;
+ case 'b': out='\b'; break;
+ case 'f': out='\f'; break;
+ default : out=c;
+ }
+ return out;
}
// ======================================================
Modified:
commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVStrategy.java
URL:
http://svn.apache.org/viewvc/commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVStrategy.java?rev=609155&r1=609154&r2=609155&view=diff
==============================================================================
--- commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVStrategy.java
(original)
+++ commons/sandbox/csv/trunk/src/java/org/apache/commons/csv/CSVStrategy.java
Sat Jan 5 07:37:26 2008
@@ -28,15 +28,21 @@
private char delimiter;
private char encapsulator;
private char commentStart;
+ private char escape;
private boolean ignoreLeadingWhitespaces;
private boolean interpretUnicodeEscapes;
private boolean ignoreEmptyLines;
- public static char COMMENTS_DISABLED = (char) 0;
-
- public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, true, false, true);
- public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, false, false, false);
- public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, true, false, true);
+ // -2 is used to signal disabled, because it won't be confused with
+ // an EOF signal (-1), and because (char)-2 is \ufffe, a Unicode
+ // noncharacter that is never assigned to a real character, so there
+ // should never be a collision with a real text char.
+ public static char COMMENTS_DISABLED = (char)-2;
+ public static char ESCAPE_DISABLED = (char)-2;
+
+ public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
+ public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false);
+ public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
@@ -58,7 +64,8 @@
public CSVStrategy(
char delimiter,
char encapsulator,
- char commentStart,
+ char commentStart,
+ char escape,
boolean ignoreLeadingWhitespace,
boolean interpretUnicodeEscapes,
boolean ignoreEmptyLines)
@@ -66,11 +73,25 @@
setDelimiter(delimiter);
setEncapsulator(encapsulator);
setCommentStart(commentStart);
+ setEscape(escape);
setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
setIgnoreEmptyLines(ignoreEmptyLines);
}
+ /** @deprecated */
+ public CSVStrategy(
+ char delimiter,
+ char encapsulator,
+ char commentStart,
+ boolean ignoreLeadingWhitespace,
+ boolean interpretUnicodeEscapes,
+ boolean ignoreEmptyLines)
+ {
+ this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines);
+ }
+
+
public void setDelimiter(char delimiter) { this.delimiter = delimiter; }
public char getDelimiter() { return this.delimiter; }
@@ -80,6 +101,9 @@
public void setCommentStart(char commentStart) { this.commentStart = commentStart; }
public char getCommentStart() { return this.commentStart; }
public boolean isCommentingDisabled() { return this.commentStart == COMMENTS_DISABLED; }
+
+ public void setEscape(char escape) { this.escape = escape; }
+ public char getEscape() { return this.escape; }
public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; }
public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; }
Modified:
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java
URL:
http://svn.apache.org/viewvc/commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java?rev=609155&r1=609154&r2=609155&view=diff
==============================================================================
---
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java
(original)
+++
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVParserTest.java
Sat Jan 5 07:37:26 2008
@@ -182,9 +182,7 @@
// encapsulator tokenizer (multi line, delimiter in string)
public void testNextToken5() throws IOException {
String code =
- "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\",\"\\\"\""
- + ",\"\\,\""
- + ",\"\"\"\"";
+ "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
TestCSVParser parser = new TestCSVParser(new StringReader(code));
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
System.out.println("---------\n" + code + "\n-------------");
@@ -193,11 +191,8 @@
assertEquals(CSVParser.TT_EORECORD + ";b;", parser.testNextToken());
assertEquals(CSVParser.TT_EORECORD + ";foo\n baar ,,,;",
parser.testNextToken());
- assertEquals(CSVParser.TT_TOKEN + ";\n\t \n;", parser.testNextToken());
- assertEquals(CSVParser.TT_TOKEN + ";\";", parser.testNextToken());
- // escape char in quoted input only escapes delimiter
- assertEquals(CSVParser.TT_TOKEN + ";\\,;", parser.testNextToken());
- assertEquals(CSVParser.TT_EOF + ";\";", parser.testNextToken());
+ assertEquals(CSVParser.TT_EOF + ";\n\t \n;", parser.testNextToken());
+
}
// change delimiters, comment, encapsulater
@@ -207,7 +202,7 @@
* !comment;;;;
* ;;
*/
- String code = "a;'b and \\' more\n'\n!comment;;;;\n;;";
+ String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
TestCSVParser parser = new TestCSVParser(new StringReader(code));
parser.setStrategy( new CSVStrategy(';', '\'', '!') );
System.out.println("---------\n" + code + "\n-------------");
@@ -226,8 +221,9 @@
"a,b,c,d\n"
+ " a , b , 1 2 \n"
+ "\"foo baar\", b,\n"
- + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
- String[][] res = {
+ // + " \"foo\n,,\n\"\",,\n\\\"\",d,e\n";
+ + " \"foo\n,,\n\"\",,\n\"\"\",d,e\n"; // changed to use standard CSV escaping
+ String[][] res = {
{"a", "b", "c", "d"},
{"a", "b", "1 2"},
{"foo baar", "b", ""},
@@ -439,7 +435,7 @@
}
}
- public void testBackslashEscaping() throws IOException {
+ public void OLDtestBackslashEscaping() throws IOException {
String code =
"one,two,three\n"
+ "on\\\"e,two\n"
@@ -474,6 +470,49 @@
}
}
+ public void testBackslashEscaping() throws IOException {
+
+ // To avoid confusion over the need for escaping chars in java code,
+ // We will test with a forward slash as the escape char, and a single
+ // quote as the encapsulator.
+
+ String code =
+ "one,two,three\n" // 0
+ + "'',''\n" // 1) empty encapsulators
+ + "/',/'\n" // 2) single encapsulators
+ + "'/'','/''\n" // 3) single encapsulators encapsulated via escape
+ + "'''',''''\n" // 4) single encapsulators encapsulated via doubling
+ + "/,,/,\n" // 5) separator escaped
+ + "//,//\n" // 6) escape escaped
+ + "'//','//'\n" // 7) escape escaped in encapsulation
+ + "";
+ String[][] res = {
+ { "one", "two", "three" }, // 0
+ { "", "" }, // 1
+ { "'", "'" }, // 2
+ { "'", "'" }, // 3
+ { "'", "'" }, // 4
+ { ",", "," }, // 5
+ { "/", "/" }, // 6
+ { "/", "/" }, // 7
+ };
+
+
+ CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
+
+ CSVParser parser = new CSVParser(new StringReader(code), strategy);
+ System.out.println("---------\n" + code + "\n-------------");
+ String[][] tmp = parser.getAllValues();
+ assertTrue(tmp.length > 0);
+ for (int i = 0; i < res.length; i++) {
+ for (int j = 0; j < tmp[i].length; j++) {
+ System.out.println("'" + tmp[i][j] + "' should be '" + res[i][j] + "'");
+ }
+ assertTrue(Arrays.equals(res[i], tmp[i]));
+ }
+ }
+
+
public void testUnicodeEscape() throws IOException {
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
CSVParser parser = new CSVParser(new StringReader(code));
Modified:
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVStrategyTest.java
URL:
http://svn.apache.org/viewvc/commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVStrategyTest.java?rev=609155&r1=609154&r2=609155&view=diff
==============================================================================
---
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVStrategyTest.java
(original)
+++
commons/sandbox/csv/trunk/src/test/org/apache/commons/csv/CSVStrategyTest.java
Sat Jan 5 07:37:26 2008
@@ -91,7 +91,7 @@
// default settings
assertEquals(strategy.getDelimiter(), ',');
assertEquals(strategy.getEncapsulator(), '"');
- assertEquals(strategy.getCommentStart(), '\0');
+ assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
assertEquals(true, strategy.getIgnoreEmptyLines());
@@ -99,7 +99,7 @@
parser.setStrategy(CSVStrategy.DEFAULT_STRATEGY);
assertEquals(strategy.getDelimiter(), ',');
assertEquals(strategy.getEncapsulator(), '"');
- assertEquals(strategy.getCommentStart(), '\0');
+ assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
assertEquals(true, strategy.getIgnoreLeadingWhitespaces());
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
assertEquals(true, strategy.getIgnoreEmptyLines());
@@ -109,7 +109,7 @@
CSVStrategy strategy = CSVStrategy.EXCEL_STRATEGY;
assertEquals(strategy.getDelimiter(), ',');
assertEquals(strategy.getEncapsulator(), '"');
- assertEquals(strategy.getCommentStart(), '\0');
+ assertEquals(strategy.getCommentStart(), CSVStrategy.COMMENTS_DISABLED);
assertEquals(false, strategy.getIgnoreLeadingWhitespaces());
assertEquals(false, strategy.getUnicodeEscapeInterpretation());
assertEquals(false, strategy.getIgnoreEmptyLines());