[AdvPaste]CSV parser: Handle control characters (quotation marks) correctly (#33986)

## Summary of the Pull Request

This PR fixes how the CSV parser handles quotation marks, following the
standard CSV rules:
- An empty data value can be written as `""`. => Remove both quotation
marks.
- A data value is enclosed in `"` (it starts and ends with `"`) if it contains
the delimiter. => The first and last quotation marks have to be removed.
- A quotation mark inside a data value is escaped with a second quotation
mark. => Replace each pair of quotation marks with a single one.

### Input
```csv
A,B,,"","my ""nice"" string","""zz""","""""double quotes"""""
```

### Before this PR (Wrong result)
```json
[
  [
    "A",
    "B",
    "",
    "\"\"",
    "\"my \"\"nice\"\" string\"",
    "\"\"\"zz\"\"\"",
    "\"\"\"\"\"double quotes\"\"\"\"\""
  ]
]
```

### After this PR (Correct result)
```json
[
  [
    "A",
    "B",
    "",
    "",
    "my \"nice\" string",
    "\"zz\"",
    "\"\"double quotes\"\""
  ]
]
```
This commit is contained in:
Heiko 2024-07-25 15:05:07 +02:00 committed by GitHub
parent d40367a860
commit 4fee37c35a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -24,9 +24,14 @@ namespace AdvancedPaste.Helpers
// Delimiter characters: comma, semicolon and tab.
// NOTE(review): presumably the candidates tried during delimiter detection - the code using this array is outside this excerpt.
private static readonly char[] CsvDelimArry = [',', ';', '\t'];
// Matches input of the form "sep=<char>" (case-insensitive) and captures the single character after the "=".
private static readonly Regex CsvSepIdentifierRegex = new Regex(@"^sep=(.)$", RegexOptions.IgnoreCase);
// Split on every occurrence of the delimiter except if it is enclosed by " and ignore two " as escaped "
// CSV: Split on every occurrence of the delimiter except if it is enclosed by " and ignore two " as escaped "
// The lookahead only succeeds at positions where the remaining text contains an even number (or none) of quotation marks.
// It is appended to the delimiter to build the pattern used for counting and splitting.
private static readonly string CsvDelimSepRegexStr = @"(?=(?:[^""]*""[^""]*"")*(?![^""]*""))";
// CSV: Regex to remove/replace quotation marks
// Matches a leading " that is not followed by ", a trailing " that is not preceded by ", or a value that is exactly "".
private static readonly Regex CsvRemoveSingleQuotationMarksRegex = new Regex(@"^""(?!"")|(?<!"")""$|^""""$");
// Matches a leading " that is followed by one or more pairs of ", or a trailing " that is preceded by one or more pairs of ".
private static readonly Regex CsvRemoveStartAndEndQuotationMarksRegex = new Regex(@"^""(?=(""{2})+)|(?<=(""{2})+)""$");
// Matches two consecutive quotation marks.
private static readonly Regex CsvReplaceDoubleQuotationMarksRegex = new Regex(@"""{2}");
internal static string ToJsonFromXmlOrCsv(DataPackageView clipboardData)
{
Logger.LogTrace();
@ -134,7 +139,7 @@ namespace AdvancedPaste.Helpers
{
if (string.IsNullOrEmpty(jsonText))
{
var csv = new List<string[]>();
var csv = new List<IEnumerable<string>>();
string[] lines = text.Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
@ -153,7 +158,8 @@ namespace AdvancedPaste.Helpers
// and if every line contains no or an even count of quotation marks.
if (Regex.Count(line, delim + CsvDelimSepRegexStr) == delimCount && int.IsEvenInteger(line.Count(x => x == '"')))
{
csv.Add(Regex.Split(line, delim + CsvDelimSepRegexStr, RegexOptions.IgnoreCase));
string[] dataCells = Regex.Split(line, delim + CsvDelimSepRegexStr, RegexOptions.IgnoreCase);
csv.Add(dataCells.Select(x => ReplaceQuotationMarksInCsvData(x)));
}
else
{
@ -244,5 +250,26 @@ namespace AdvancedPaste.Helpers
throw new FormatException("Invalid CSV format: Failed to detect the delimiter.");
}
}
/// <summary>
/// Strips the quotation marks that act as CSV control characters from a single cell:
/// the enclosing quotation marks around a quoted value and the extra mark of every escaped (doubled) quotation mark.
/// </summary>
/// <param name="str">Text of one CSV cell.</param>
/// <returns>The cell text after the three replacement passes have been applied.</returns>
private static string ReplaceQuotationMarksInCsvData(string str)
{
    // Pass 1: drop a leading/trailing quotation mark that stands alone (plain enclosing marks)
    // and reduce a cell that consists only of "" (empty data set) to an empty string.
    string withoutPlainEnclosure = CsvRemoveSingleQuotationMarksRegex.Replace(str, string.Empty);

    // Pass 2: drop a leading quotation mark that is followed by pairs of quotation marks
    // and a trailing quotation mark that is preceded by pairs of quotation marks.
    // (Covers enclosing marks that sit directly next to escaped marks, e.g. /"""abc"""/.)
    string withoutEnclosure = CsvRemoveStartAndEndQuotationMarksRegex.Replace(withoutPlainEnclosure, string.Empty);

    // Pass 3: collapse every pair of quotation marks (escaped mark) into a single quotation mark.
    return CsvReplaceDoubleQuotationMarksRegex.Replace(withoutEnclosure, "\"");
}
}
}