From 1843bcdfa6fa2702886249d26c303c6889a6e9a2 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Wed, 9 Oct 2024 18:46:43 +0300 Subject: [PATCH] Fix dataframe incorrectly parse CSV when renameDuplicatedColumns is true (#7242) * Fix dataframe incorrectly parse CSV when renameDuplicatedColumns is true * Update System.Numerics.Tensors dependency --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 3 ++- .../Microsoft.Data.Analysis.csproj | 2 +- .../DataFrame.IOTests.cs | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 3714fb18c2..4d8fb1487a 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -388,7 +388,8 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe // First pass: schema and number of rows. while ((fields = parser.ReadFields()) != null) { - if (renameDuplicatedColumns) + //Only first row contains column names + if (renameDuplicatedColumns && rowline == 0) { var names = new Dictionary(); diff --git a/src/Microsoft.Data.Analysis/Microsoft.Data.Analysis.csproj b/src/Microsoft.Data.Analysis/Microsoft.Data.Analysis.csproj index aeaea39c32..a7462d1949 100644 --- a/src/Microsoft.Data.Analysis/Microsoft.Data.Analysis.csproj +++ b/src/Microsoft.Data.Analysis/Microsoft.Data.Analysis.csproj @@ -46,7 +46,7 @@ - + diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 4b0a2a5fd0..441c581ef6 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -119,6 +119,25 @@ private static Stream GetStream(string streamData) return new MemoryStream(Encoding.Default.GetBytes(streamData)); } + [Fact] + public void TestReadCsvWithHeaderCultureInfoAndColumnTypeAutoGuess() + { + //see https://github.com/dotnet/machinelearning/issues/7240 + + CultureInfo.CurrentCulture = CultureInfo.InvariantCulture; // or en-US + + string csv = +@"""Col1"",""Col2"",""Col3"",""Col4"" +""v1.1"",""5/7/2017"",""v3.1"",""v4.1"" +"""","""",""v3.2"",""v4.2"" +"; + + var dataFrame = DataFrame.LoadCsvFromString(csv, separator: ',', header: true, + dataTypes: null, // guess the column types + renameDuplicatedColumns: true, // try to rename the duplicated columns, if any + cultureInfo: CultureInfo.InvariantCulture); + } + [Theory] [InlineData(false)] [InlineData(true)]