From 4fee37c35a148947b9f487e657c883bcc5cdea94 Mon Sep 17 00:00:00 2001 From: Heiko <61519853+htcfreek@users.noreply.github.com> Date: Thu, 25 Jul 2024 15:05:07 +0200 Subject: [PATCH] [AdvPaste]CSV parser: Handle control characters (quotation marks) correctly (#33986) ## Summary of the Pull Request This PR fixes the CSV parsing of quotation marks so that it follows the standard CSV rules: - An empty data value can be written as `""`. => Remove both quotation marks. - Data values are enclosed in `"` if they contain the delimiter. => The first and last quotation marks have to be removed. - A quotation mark inside a data value is escaped with a second quotation mark. => Replace each pair of quotation marks with a single one. ### Input ```csv A,B,,"","my ""nice"" string","""zz""","""""double quotes""""" ``` ### Before this PR (Wrong result) ```json [ [ "A", "B", "", "\"\"", "\"my \"\"nice\"\" string\"", "\"\"\"zz\"\"\"", "\"\"\"\"\"double quotes\"\"\"\"\"" ] ] ``` ### After this PR (Correct result) ```json [ [ "A", "B", "", "", "my \"nice\" string", "\"zz\"", "\"\"double quotes\"\"" ] ] ``` --- .../AdvancedPaste/Helpers/JsonHelper.cs | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs index fc23a6de5b..3cff93493e 100644 --- a/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs +++ b/src/modules/AdvancedPaste/AdvancedPaste/Helpers/JsonHelper.cs @@ -24,9 +24,14 @@ namespace AdvancedPaste.Helpers private static readonly char[] CsvDelimArry = [',', ';', '\t']; private static readonly Regex CsvSepIdentifierRegex = new Regex(@"^sep=(.)$", RegexOptions.IgnoreCase); - // Split on every occurrence of the delimiter except if it is enclosed by " and ignore two " as escaped " + // CSV: Split on every occurrence of the delimiter except if it is enclosed by " and ignore two " as escaped " private static readonly string CsvDelimSepRegexStr = 
@"(?=(?:[^""]*""[^""]*"")*(?![^""]*""))"; + // CSV: Regex to remove/replace quotation marks + private static readonly Regex CsvRemoveSingleQuotationMarksRegex = new Regex(@"^""(?!"")|(?(); + var csv = new List>(); string[] lines = text.Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries); @@ -153,7 +158,8 @@ namespace AdvancedPaste.Helpers // and if every line contains no or an even count of quotation marks. if (Regex.Count(line, delim + CsvDelimSepRegexStr) == delimCount && int.IsEvenInteger(line.Count(x => x == '"'))) { - csv.Add(Regex.Split(line, delim + CsvDelimSepRegexStr, RegexOptions.IgnoreCase)); + string[] dataCells = Regex.Split(line, delim + CsvDelimSepRegexStr, RegexOptions.IgnoreCase); + csv.Add(dataCells.Select(x => ReplaceQuotationMarksInCsvData(x))); } else { @@ -244,5 +250,26 @@ namespace AdvancedPaste.Helpers throw new FormatException("Invalid CSV format: Failed to detect the delimiter."); } } + + /// <summary> + /// Remove and replace quotation marks used as control sequences. (Enclosing quotation marks and escaping quotation marks.) + /// </summary> + /// <param name="str">CSV cell data to manipulate.</param> + /// <returns>Manipulated string.</returns> + private static string ReplaceQuotationMarksInCsvData(string str) + { + // Remove first and last single quotation mark (enclosing quotation marks) and remove quotation marks of an empty data set (""). + str = CsvRemoveSingleQuotationMarksRegex.Replace(str, string.Empty); + + // Remove first quotation mark if followed by pairs of quotation marks + // and remove last quotation mark if preceded by pairs of quotation marks. + // (Removes enclosing quotation marks around the cell data for data like /"""abc"""/.) + str = CsvRemoveStartAndEndQuotationMarksRegex.Replace(str, string.Empty); + + // Replace pairs of two quotation marks with a single quotation mark. (Escaped quotation marks.) + str = CsvReplaceDoubleQuotationMarksRegex.Replace(str, "\""); + + return str; + } } }