From 3652e3627ab06f9a2d436b575162753e750939e7 Mon Sep 17 00:00:00 2001 From: Vaibhav Sharma <48472541+GhostVaibhav@users.noreply.github.com> Date: Tue, 23 Jul 2024 18:59:33 +0530 Subject: [PATCH] [AdvPaste]Fix CSV parser supporting escape delimiter by enclosing in double quotes (#33874) ## Summary of the Pull Request This PR fixes the CSV parser support for escaping delimiter by enclosing it in quotes ## Detailed Description of the Pull Request / Additional comments - This PR introduces a fix for the support of adding a delimiter to the string and supporting it by enclosing it in `"` --- .github/actions/spell-check/expect.txt | 1 + .../AdvancedPaste/Helpers/JsonHelper.cs | 20 +++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/actions/spell-check/expect.txt b/.github/actions/spell-check/expect.txt index 27bfb943ab..1299374146 100644 --- a/.github/actions/spell-check/expect.txt +++ b/.github/actions/spell-check/expect.txt @@ -1064,6 +1064,7 @@ numberbox nwc Objbase objidl +occurrence ocr Ocrsettings odbccp diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs index 680308f519..fc23a6de5b 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs @@ -24,6 +24,9 @@ namespace AdvancedPaste.Helpers private static readonly char[] CsvDelimArry = [',', ';', '\t']; private static readonly Regex CsvSepIdentifierRegex = new Regex(@"^sep=(.)$", RegexOptions.IgnoreCase); + // Split on every occurrence of the delimiter except if it is enclosed by " and ignore two " as escaped " + private static readonly string CsvDelimSepRegexStr = @"(?=(?:[^""]*""[^""]*"")*(?![^""]*""))"; + internal static string ToJsonFromXmlOrCsv(DataPackageView clipboardData) { Logger.LogTrace(); @@ -146,10 +149,11 @@ namespace AdvancedPaste.Helpers continue; } - // A CSV line is valid, if the delimiter occurs more or equal times in every line compared to the first data line. (More because sometimes the delimiter occurs in a data string.) - if (line.Count(x => x == delim) >= delimCount) + // A CSV line is valid, if the delimiter occurs equal times in every line compared to the first data line + // and if every line contains no or an even count of quotation marks. + if (Regex.Count(line, delim + CsvDelimSepRegexStr) == delimCount && int.IsEvenInteger(line.Count(x => x == '"'))) { - csv.Add(line.Split(delim)); + csv.Add(Regex.Split(line, delim + CsvDelimSepRegexStr, RegexOptions.IgnoreCase)); } else { @@ -205,7 +209,7 @@ namespace AdvancedPaste.Helpers // We get the count from the second line, as the first one only contains the character definition and not a CSV data line. char delimChar = matchChar.Groups[1].Value.Trim()[0]; delimiter = delimChar; - delimiterCount = csvLines[1].Count(x => x == delimChar); + delimiterCount = Regex.Count(csvLines[1], delimChar + CsvDelimSepRegexStr, RegexOptions.IgnoreCase); } } @@ -214,19 +218,19 @@ namespace AdvancedPaste.Helpers // Try to select the correct delimiter based on the first two CSV lines from a list of predefined delimiters. foreach (char c in CsvDelimArry) { - int cntFirstLine = csvLines[0].Count(x => x == c); + int cntFirstLine = Regex.Count(csvLines[0], c + CsvDelimSepRegexStr, RegexOptions.IgnoreCase); int cntNextLine = 0; // Default to 0 that the 'second line' check is always true. // Additional count if we have more than one line if (csvLines.Length >= 2) { - cntNextLine = csvLines[1].Count(x => x == c); + cntNextLine = Regex.Count(csvLines[1], c + CsvDelimSepRegexStr, RegexOptions.IgnoreCase); } // The delimiter is found if the count is bigger as from the last selected delimiter - // and if the next csv line does not exist or has the same number or more occurrences of the delimiter. + // and if the next csv line does not exist or has the same number of occurrences of the delimiter. // (We check the next line to prevent false positives.) - if (cntFirstLine > delimiterCount && (cntNextLine == 0 || cntNextLine >= cntFirstLine)) + if (cntFirstLine > delimiterCount && (cntNextLine == 0 || cntNextLine == cntFirstLine)) { delimiter = c; delimiterCount = cntFirstLine;