Furthermore, the new behaviour both fixes #126805 and matches Excel's behaviour on the same tests, so I am very happy.
On Mon, Apr 18, 2016 at 2:05 AM, Damjan Jovanovic <[email protected]> wrote: > The way the CSV field parsers in both Calc and Base work is that a > quoted field is only quoted up until the earliest matching quote > character that has no adjacent quote to escape it. The text after it, > and until the field separator, is unquoted. See > QuotedTokenizedString::GetTokenSpecial() in > main/connectivity/source/drivers/file/quotedstring.cxx for Base, and > ScImportExport::ScanNextFieldFromString() in > main/sc/source/ui/docshell/impex.cxx for Calc in which a comment calls > this "Append remaining unquoted and undelimited data (dirty, dirty) to > this field". > > "abc"d is parsed as [abcd], and "another " " as [another "]. It's not > clear why this was done, but it is clear that it was done > intentionally. > > Damjan > > On Sun, Apr 17, 2016 at 10:34 PM, Dennis E. Hamilton > <[email protected]> wrote: >> Does the rule about using "" to make a single quote inside a quoted field >> also apply? >> >> - Dennis >> >>> -----Original Message----- >>> From: [email protected] [mailto:[email protected]] >>> Sent: Sunday, April 17, 2016 09:45 >>> To: [email protected] >>> Subject: svn commit: r1739628 - in /openoffice/trunk/main: >>> connectivity/source/drivers/flat/ETable.cxx >>> tools/source/stream/stream.cxx >>> >>> Author: damjan >>> Date: Sun Apr 17 16:44:43 2016 >>> New Revision: 1739628 >>> >>> URL: http://svn.apache.org/viewvc?rev=1739628&view=rev >>> Log: >>> Make CSV line parsers consistent with CSV field parsers. >>> >>> Our CSV field parsing algorithms treat fields starting with a quote >>> (immediately at the beginning of the row, or after the field delimiter) >>> as >>> quoted. A quoted field ends at the corresponding closing quote, and any >>> remaining text between the closing quote and the next field delimiter or >>> end >>> of line is appended to the text already extracted from the field, but >>> not >>> processed further. 
Any quotes in this extra text are taken verbatim - >>> they >>> do not quote anything. >>> >>> Our CSV line parsers were big hacks - they essentially read and >>> concatenate >>> lines until an even number of quote characters is found, and then feed >>> this >>> through the CSV field parsers. >>> >>> This patch rewrites the line parsers to work exactly how the field >>> parsers >>> work. Text such as: >>> "another" ",something else >>> is now correctly parsed by both Calc and Base as: >>> [another "],[something else] >>> instead of breaking all further parsing. >>> >>> Patch by: me >>> >>> >>> Modified: >>> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx >>> openoffice/trunk/main/tools/source/stream/stream.cxx >>> >>> Modified: >>> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx >>> URL: >>> http://svn.apache.org/viewvc/openoffice/trunk/main/connectivity/source/d >>> rivers/flat/ETable.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff >>> ======================================================================== >>> ====== >>> --- openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx >>> (original) >>> +++ openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx >>> Sun Apr 17 16:44:43 2016 >>> @@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedToke >>> return sal_False; >>> >>> QuotedTokenizedString sLine = line; // check if the string >>> continues on next line >>> - while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) != >>> 1 ) >>> + xub_StrLen nLastOffset = 0; >>> + bool isQuoted = false; >>> + bool isFieldStarting = true; >>> + while (true) >>> { >>> - m_pFileStream->ReadByteStringLine(sLine,nEncoding); >>> - if ( !m_pFileStream->IsEof() ) >>> + bool wasQuote = false; >>> + const sal_Unicode *p; >>> + p = sLine.GetString().GetBuffer(); >>> + p += nLastOffset; >>> + >>> + while (*p) >>> + { >>> + if (isQuoted) >>> + { >>> + if (*p == m_cStringDelimiter) >>> + wasQuote = !wasQuote; >>> + 
else >>> + { >>> + if (wasQuote) >>> + { >>> + wasQuote = false; >>> + isQuoted = false; >>> + if (*p == m_cFieldDelimiter) >>> + isFieldStarting = true; >>> + } >>> + } >>> + } >>> + else >>> + { >>> + if (isFieldStarting) >>> + { >>> + isFieldStarting = false; >>> + if (*p == m_cStringDelimiter) >>> + isQuoted = true; >>> + else if (*p == m_cFieldDelimiter) >>> + isFieldStarting = true; >>> + } >>> + else if (*p == m_cFieldDelimiter) >>> + isFieldStarting = true; >>> + } >>> + ++p; >>> + } >>> + >>> + if (wasQuote) >>> + isQuoted = false; >>> + >>> + if (isQuoted) >>> { >>> - line.GetString().Append('\n'); >>> - line.GetString() += sLine.GetString(); >>> - sLine = line; >>> + nLastOffset = sLine.Len(); >>> + m_pFileStream->ReadByteStringLine(sLine,nEncoding); >>> + if ( !m_pFileStream->IsEof() ) >>> + { >>> + line.GetString().Append('\n'); >>> + line.GetString() += sLine.GetString(); >>> + sLine = line; >>> + } >>> + else >>> + break; >>> } >>> else >>> break; >>> >>> Modified: openoffice/trunk/main/tools/source/stream/stream.cxx >>> URL: >>> http://svn.apache.org/viewvc/openoffice/trunk/main/tools/source/stream/s >>> tream.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff >>> ======================================================================== >>> ====== >>> --- openoffice/trunk/main/tools/source/stream/stream.cxx (original) >>> +++ openoffice/trunk/main/tools/source/stream/stream.cxx Sun Apr 17 >>> 16:44:43 2016 >>> @@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String& >>> { >>> const sal_Unicode* pSeps = rFieldSeparators.GetBuffer(); >>> xub_StrLen nLastOffset = 0; >>> - xub_StrLen nQuotes = 0; >>> + bool isQuoted = false; >>> + bool isFieldStarting = true; >>> while (!IsEof() && rStr.Len() < STRING_MAXLEN) >>> { >>> + bool wasQuote = false; >>> bool bBackslashEscaped = false; >>> - const sal_Unicode *p, *pStart; >>> - p = pStart = rStr.GetBuffer(); >>> + const sal_Unicode *p; >>> + p = rStr.GetBuffer(); >>> p += nLastOffset; >>> while (*p) >>> { 
>>> - if (nQuotes) >>> + if (isQuoted) >>> { >>> if (*p == cFieldQuote && !bBackslashEscaped) >>> - ++nQuotes; >>> - else if (bAllowBackslashEscape) >>> + wasQuote = !wasQuote; >>> + else >>> { >>> - if (*p == '\\') >>> - bBackslashEscaped = !bBackslashEscaped; >>> - else >>> - bBackslashEscaped = false; >>> + if (bAllowBackslashEscape) >>> + { >>> + if (*p == '\\') >>> + bBackslashEscaped = !bBackslashEscaped; >>> + else >>> + bBackslashEscaped = false; >>> + } >>> + if (wasQuote) >>> + { >>> + wasQuote = false; >>> + isQuoted = false; >>> + if (lcl_UnicodeStrChr( pSeps, *p )) >>> + isFieldStarting = true; >>> + } >>> } >>> } >>> - else if (*p == cFieldQuote && (p == pStart || >>> - lcl_UnicodeStrChr( pSeps, p[-1]))) >>> - nQuotes = 1; >>> - // A quote character inside a field content does not >>> start >>> - // a quote. >>> + else >>> + { >>> + if (isFieldStarting) >>> + { >>> + isFieldStarting = false; >>> + if (*p == cFieldQuote) >>> + isQuoted = true; >>> + else if (lcl_UnicodeStrChr( pSeps, *p )) >>> + isFieldStarting = true; >>> + } >>> + else if (lcl_UnicodeStrChr( pSeps, *p )) >>> + isFieldStarting = true; >>> + } >>> ++p; >>> } >>> >>> - if (nQuotes % 2 == 0) >>> - break; >>> - else >>> + if (wasQuote) >>> + isQuoted = false; >>> + >>> + if (isQuoted) >>> { >>> nLastOffset = rStr.Len(); >>> String aNext; >>> @@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String& >>> rStr += sal_Unicode(_LF); >>> rStr += aNext; >>> } >>> + else >>> + break; >>> } >>> } >>> return nError == SVSTREAM_OK; >> >> >> >> --------------------------------------------------------------------- >> To unsubscribe, e-mail: [email protected] >> For additional commands, e-mail: [email protected] >> --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
