Super! I will test it out soonish -- maybe next week. On Mon, Apr 18, 2016 at 10:39 AM, Damjan Jovanovic <[email protected]> wrote:
> Furthermore, the new behaviour both fixes #126805 and matches Excel's > behaviour on the same tests, so I am very happy. > > On Mon, Apr 18, 2016 at 2:05 AM, Damjan Jovanovic <[email protected]> > wrote: > > The way the CSV field parsers in both Calc and Base work is that a > > quoted field is only quoted up until the earliest matching quote > > character that has no adjacent quote to escape it. The text after it, > > and until the field separator, is unquoted. See > > QuotedTokenizedString::GetTokenSpecial() in > > main/connectivity/source/drivers/file/quotedstring.cxx for Base, and > > ScImportExport::ScanNextFieldFromString() in > > main/sc/source/ui/docshell/impex.cxx for Calc in which a comment calls > > this "Append remaining unquoted and undelimited data (dirty, dirty) to > > this field". > > > > "abc"d is parsed as [abcd], and "another " " as [another "]. It's not > > clear why this was done, but it is clear that it was done > > intentionally. > > > > Damjan > > > > On Sun, Apr 17, 2016 at 10:34 PM, Dennis E. Hamilton > > <[email protected]> wrote: > >> Does the rule about using "" to make a single quote inside a quoted > field also apply? > >> > >> - Dennis > >> > >>> -----Original Message----- > >>> From: [email protected] [mailto:[email protected]] > >>> Sent: Sunday, April 17, 2016 09:45 > >>> To: [email protected] > >>> Subject: svn commit: r1739628 - in /openoffice/trunk/main: > >>> connectivity/source/drivers/flat/ETable.cxx > >>> tools/source/stream/stream.cxx > >>> > >>> Author: damjan > >>> Date: Sun Apr 17 16:44:43 2016 > >>> New Revision: 1739628 > >>> > >>> URL: http://svn.apache.org/viewvc?rev=1739628&view=rev > >>> Log: > >>> Make CSV line parsers consistent with CSV field parsers. > >>> > >>> Our CSV field parsing algorithms treats fields starting with a quote > >>> (immediately at the beginning of the row, or after the field delimiter) > >>> as > >>> quoted. 
A quoted field ends at the corresponding closing quote, and any > >>> remaining text between the closing quote and the next field delimiter > or > >>> end > >>> of line is appended to the text already extracted from the field, but > >>> not > >>> processed further. Any quotes in this extra text are taken verbatim - > >>> they > >>> do not quote anything. > >>> > >>> Our CSV line parsers were big hacks - they essentially read and > >>> concatenate > >>> lines until an even number of quote characters is found, and then feed > >>> this > >>> through the CSV field parsers. > >>> > >>> This patch rewrites the line parsers to work exactly how the field > >>> parsers > >>> work. Text such as: > >>> "another" ",something else > >>> is now correctly parsed by both Calc and Base as: > >>> [another "],[something else] > >>> instead of breaking all further parsing. > >>> > >>> Patch by: me > >>> > >>> > >>> Modified: > >>> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx > >>> openoffice/trunk/main/tools/source/stream/stream.cxx > >>> > >>> Modified: > >>> openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx > >>> URL: > >>> > http://svn.apache.org/viewvc/openoffice/trunk/main/connectivity/source/d > >>> rivers/flat/ETable.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff > >>> > ======================================================================== > >>> ====== > >>> --- openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx > >>> (original) > >>> +++ openoffice/trunk/main/connectivity/source/drivers/flat/ETable.cxx > >>> Sun Apr 17 16:44:43 2016 > >>> @@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedToke > >>> return sal_False; > >>> > >>> QuotedTokenizedString sLine = line; // check if the string > >>> continues on next line > >>> - while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) > != > >>> 1 ) > >>> + xub_StrLen nLastOffset = 0; > >>> + bool isQuoted = false; > >>> + bool isFieldStarting = true; > >>> + 
while (true) > >>> { > >>> - m_pFileStream->ReadByteStringLine(sLine,nEncoding); > >>> - if ( !m_pFileStream->IsEof() ) > >>> + bool wasQuote = false; > >>> + const sal_Unicode *p; > >>> + p = sLine.GetString().GetBuffer(); > >>> + p += nLastOffset; > >>> + > >>> + while (*p) > >>> + { > >>> + if (isQuoted) > >>> + { > >>> + if (*p == m_cStringDelimiter) > >>> + wasQuote = !wasQuote; > >>> + else > >>> + { > >>> + if (wasQuote) > >>> + { > >>> + wasQuote = false; > >>> + isQuoted = false; > >>> + if (*p == m_cFieldDelimiter) > >>> + isFieldStarting = true; > >>> + } > >>> + } > >>> + } > >>> + else > >>> + { > >>> + if (isFieldStarting) > >>> + { > >>> + isFieldStarting = false; > >>> + if (*p == m_cStringDelimiter) > >>> + isQuoted = true; > >>> + else if (*p == m_cFieldDelimiter) > >>> + isFieldStarting = true; > >>> + } > >>> + else if (*p == m_cFieldDelimiter) > >>> + isFieldStarting = true; > >>> + } > >>> + ++p; > >>> + } > >>> + > >>> + if (wasQuote) > >>> + isQuoted = false; > >>> + > >>> + if (isQuoted) > >>> { > >>> - line.GetString().Append('\n'); > >>> - line.GetString() += sLine.GetString(); > >>> - sLine = line; > >>> + nLastOffset = sLine.Len(); > >>> + m_pFileStream->ReadByteStringLine(sLine,nEncoding); > >>> + if ( !m_pFileStream->IsEof() ) > >>> + { > >>> + line.GetString().Append('\n'); > >>> + line.GetString() += sLine.GetString(); > >>> + sLine = line; > >>> + } > >>> + else > >>> + break; > >>> } > >>> else > >>> break; > >>> > >>> Modified: openoffice/trunk/main/tools/source/stream/stream.cxx > >>> URL: > >>> > http://svn.apache.org/viewvc/openoffice/trunk/main/tools/source/stream/s > >>> tream.cxx?rev=1739628&r1=1739627&r2=1739628&view=diff > >>> > ======================================================================== > >>> ====== > >>> --- openoffice/trunk/main/tools/source/stream/stream.cxx (original) > >>> +++ openoffice/trunk/main/tools/source/stream/stream.cxx Sun Apr 17 > >>> 16:44:43 2016 > >>> @@ -1128,38 +1128,59 @@ sal_Bool 
SvStream::ReadCsvLine( String& > >>> { > >>> const sal_Unicode* pSeps = rFieldSeparators.GetBuffer(); > >>> xub_StrLen nLastOffset = 0; > >>> - xub_StrLen nQuotes = 0; > >>> + bool isQuoted = false; > >>> + bool isFieldStarting = true; > >>> while (!IsEof() && rStr.Len() < STRING_MAXLEN) > >>> { > >>> + bool wasQuote = false; > >>> bool bBackslashEscaped = false; > >>> - const sal_Unicode *p, *pStart; > >>> - p = pStart = rStr.GetBuffer(); > >>> + const sal_Unicode *p; > >>> + p = rStr.GetBuffer(); > >>> p += nLastOffset; > >>> while (*p) > >>> { > >>> - if (nQuotes) > >>> + if (isQuoted) > >>> { > >>> if (*p == cFieldQuote && !bBackslashEscaped) > >>> - ++nQuotes; > >>> - else if (bAllowBackslashEscape) > >>> + wasQuote = !wasQuote; > >>> + else > >>> { > >>> - if (*p == '\\') > >>> - bBackslashEscaped = !bBackslashEscaped; > >>> - else > >>> - bBackslashEscaped = false; > >>> + if (bAllowBackslashEscape) > >>> + { > >>> + if (*p == '\\') > >>> + bBackslashEscaped = > !bBackslashEscaped; > >>> + else > >>> + bBackslashEscaped = false; > >>> + } > >>> + if (wasQuote) > >>> + { > >>> + wasQuote = false; > >>> + isQuoted = false; > >>> + if (lcl_UnicodeStrChr( pSeps, *p )) > >>> + isFieldStarting = true; > >>> + } > >>> } > >>> } > >>> - else if (*p == cFieldQuote && (p == pStart || > >>> - lcl_UnicodeStrChr( pSeps, p[-1]))) > >>> - nQuotes = 1; > >>> - // A quote character inside a field content does not > >>> start > >>> - // a quote. 
> >>> + else > >>> + { > >>> + if (isFieldStarting) > >>> + { > >>> + isFieldStarting = false; > >>> + if (*p == cFieldQuote) > >>> + isQuoted = true; > >>> + else if (lcl_UnicodeStrChr( pSeps, *p )) > >>> + isFieldStarting = true; > >>> + } > >>> + else if (lcl_UnicodeStrChr( pSeps, *p )) > >>> + isFieldStarting = true; > >>> + } > >>> ++p; > >>> } > >>> > >>> - if (nQuotes % 2 == 0) > >>> - break; > >>> - else > >>> + if (wasQuote) > >>> + isQuoted = false; > >>> + > >>> + if (isQuoted) > >>> { > >>> nLastOffset = rStr.Len(); > >>> String aNext; > >>> @@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String& > >>> rStr += sal_Unicode(_LF); > >>> rStr += aNext; > >>> } > >>> + else > >>> + break; > >>> } > >>> } > >>> return nError == SVSTREAM_OK; > >> > >> > >> > >> --------------------------------------------------------------------- > >> To unsubscribe, e-mail: [email protected] > >> For additional commands, e-mail: [email protected] > >> > > --------------------------------------------------------------------- > To unsubscribe, e-mail: [email protected] > For additional commands, e-mail: [email protected] > > -- ---------------------------------------------------------------------- MzK "Time spent with cats is never wasted." -- Sigmund Freud
