Skip to content

Commit

Permalink
Fix dataframe incorrectly parse CSV when renameDuplicatedColumns is t…
Browse files Browse the repository at this point in the history
…rue (dotnet#7242)

* Fix DataFrame incorrectly parsing CSV when renameDuplicatedColumns is true

* Update System.Numerics.Tensors dependency
  • Loading branch information
asmirnov82 authored Oct 9, 2024
1 parent 9baf26b commit 1843bcd
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 2 deletions.
3 changes: 2 additions & 1 deletion src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,8 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
// First pass: schema and number of rows.
while ((fields = parser.ReadFields()) != null)
{
if (renameDuplicatedColumns)
//Only first row contains column names
if (renameDuplicatedColumns && rowline == 0)
{
var names = new Dictionary<string, int>();

Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/Microsoft.Data.Analysis.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
</ItemGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'net8.0'">
<PackageReference Include="System.Numerics.Tensors" Version="9.0.0-preview.6.24327.7" />
<PackageReference Include="System.Numerics.Tensors" Version="9.0.0-rc.1.24431.7" />
</ItemGroup>

<ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETStandard'">
Expand Down
19 changes: 19 additions & 0 deletions test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,25 @@ private static Stream GetStream(string streamData)
return new MemoryStream(Encoding.Default.GetBytes(streamData));
}

[Fact]
public void TestReadCsvWithHeaderCultureInfoAndColumnTypeAutoGuess()
{
    // Regression test for https://github.com/dotnet/machinelearning/issues/7240
    // Loads a quoted CSV with a header row, guessed column types and
    // renameDuplicatedColumns enabled. The last data row repeats the same
    // (empty) value in two columns, which must be treated as data and not
    // as duplicated column names.

    // CurrentCulture is ambient state; remember it so the override below
    // does not leak into other tests running on the same thread.
    CultureInfo previousCulture = CultureInfo.CurrentCulture;
    CultureInfo.CurrentCulture = CultureInfo.InvariantCulture; // or en-US

    try
    {
        // The verbatim literal must stay at column 0: any leading whitespace
        // would become part of the CSV content.
        string csv =
@"""Col1"",""Col2"",""Col3"",""Col4""
""v1.1"",""5/7/2017"",""v3.1"",""v4.1""
"""","""",""v3.2"",""v4.2""
";

        var dataFrame = DataFrame.LoadCsvFromString(csv, separator: ',', header: true,
            dataTypes: null, // guess the column types
            renameDuplicatedColumns: true, // try to rename the duplicated columns, if any
            cultureInfo: CultureInfo.InvariantCulture);

        // Loading without an exception is not enough: verify the header row
        // was used verbatim for the column names and both data rows were read.
        Assert.Equal(4, dataFrame.Columns.Count);
        Assert.Equal(2, dataFrame.Rows.Count);
        Assert.Equal("Col1", dataFrame.Columns[0].Name);
        Assert.Equal("Col2", dataFrame.Columns[1].Name);
        Assert.Equal("Col3", dataFrame.Columns[2].Name);
        Assert.Equal("Col4", dataFrame.Columns[3].Name);
    }
    finally
    {
        CultureInfo.CurrentCulture = previousCulture;
    }
}

[Theory]
[InlineData(false)]
[InlineData(true)]
Expand Down

0 comments on commit 1843bcd

Please sign in to comment.