From 709f2267305fed4f1878678ef6b8acf7464587b7 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 17 Oct 2023 23:00:37 -0400
Subject: [PATCH] fix: Reconfigure the encoding of standard input according to
 the --encoding option, closes #1038

---
 CHANGELOG.rst                          |  1 +
 csvkit/cli.py                          |  1 +
 tests/test_convert/test_fixed.py       |  8 ++++----
 tests/test_utilities/test_csvclean.py  |  4 ++--
 tests/test_utilities/test_csvformat.py |  8 ++++----
 tests/test_utilities/test_csvjson.py   |  4 ++--
 tests/test_utilities/test_csvlook.py   |  4 ++--
 tests/test_utilities/test_csvsort.py   |  4 ++--
 tests/test_utilities/test_csvsql.py    | 14 +++++++-------
 tests/test_utilities/test_csvstack.py  | 10 +++++-----
 tests/test_utilities/test_in2csv.py    | 18 +++++++++---------
 tests/test_utilities/test_sql2csv.py   | 10 +++++-----
 tests/utils.py                         | 14 +++++++-------
 13 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 37310b31f..c5e191514 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -10,6 +10,7 @@ Unreleased
 * :doc:`/scripts/csvstat` adds a :code:`--non-nulls` option to only output counts of non-null values.
 * :doc:`/scripts/csvstat` adds a :code:`--max-precision` option to only output the most decimal places.
 * feat: Add a :code:`--null-value` option to commands with the :code:`--blanks` option, to convert additional values to NULL.
+* fix: Reconfigure the encoding of standard input according to the :code:`--encoding` option, which defaults to ``utf-8-sig``. Affected users no longer need to set the ``PYTHONIOENCODING`` environment variable.
 * fix: Prompt the user if additional input is expected (i.e. if no input file or piped data is provided) in :doc:`/scripts/csvjoin`, :doc:`/scripts/csvsql` and :doc:`/scripts/csvstack`.
 * fix: No longer errors if a NUL byte occurs in an input file.
 * Add Python 3.12 support.
diff --git a/csvkit/cli.py b/csvkit/cli.py
index e62a066a9..0681e07cc 100644
--- a/csvkit/cli.py
+++ b/csvkit/cli.py
@@ -238,6 +238,7 @@ def _open_input_file(self, path):
         Open the input file specified on the command line.
         """
         if not path or path == '-':
+            sys.stdin.reconfigure(encoding=self.args.encoding)
             f = sys.stdin
         else:
             extension = splitext(path)[1]
diff --git a/tests/test_convert/test_fixed.py b/tests/test_convert/test_fixed.py
index 3ce91b2ea..d8f57be78 100644
--- a/tests/test_convert/test_fixed.py
+++ b/tests/test_convert/test_fixed.py
@@ -1,4 +1,4 @@
-from io import StringIO
+import io
 
 from csvkit.convert import fixed
 from csvkit.utilities.in2csv import In2CSV
@@ -23,7 +23,7 @@ def test_fixed_skip_lines(self):
             self.assertEqual(f.read(), output)
 
     def test_fixed_no_inference(self):
-        input_file = StringIO('     1   2 3')
+        input_file = io.BytesIO(b'     1   2 3')
 
         with stdin_as_string(input_file):
             self.assertLines(['--no-inference', '-f', 'fixed', '--schema',
@@ -36,7 +36,7 @@ def test_fixed_no_inference(self):
 
     def test_fixed_streaming(self):
         with open('examples/testfixed') as f, open('examples/testfixed_schema.csv') as schema:
-            output_file = StringIO()
+            output_file = io.StringIO()
             fixed.fixed2csv(f, schema, output=output_file)
             output = output_file.getvalue()
             output_file.close()
@@ -91,7 +91,7 @@ def test_schematic_line_parser(self):
 bar,6,2
 baz,8,5"""
 
-        f = StringIO(schema)
+        f = io.StringIO(schema)
         parser = fixed.FixedWidthRowParser(f)
         f.close()
 
diff --git a/tests/test_utilities/test_csvclean.py b/tests/test_utilities/test_csvclean.py
index 0d76698fb..1d284c942 100644
--- a/tests/test_utilities/test_csvclean.py
+++ b/tests/test_utilities/test_csvclean.py
@@ -1,6 +1,6 @@
+import io
 import os
 import sys
-from io import StringIO
 from unittest.mock import patch
 
 from csvkit.utilities.csvclean import CSVClean, launch_new_instance
@@ -17,7 +17,7 @@ def tearDown(self):
 
     def assertCleaned(self, basename, output_lines, error_lines, additional_args=[]):
         args = [f'examples/{basename}.csv'] + additional_args
-        output_file = StringIO()
+        output_file = io.StringIO()
 
         utility = CSVClean(args, output_file)
         utility.run()
diff --git a/tests/test_utilities/test_csvformat.py b/tests/test_utilities/test_csvformat.py
index 70567152b..5e2ebf5e2 100644
--- a/tests/test_utilities/test_csvformat.py
+++ b/tests/test_utilities/test_csvformat.py
@@ -1,5 +1,5 @@
+import io
 import sys
-from io import StringIO
 from unittest.mock import patch
 
 from csvkit.utilities.csvformat import CSVFormat, launch_new_instance
@@ -54,7 +54,7 @@ def test_tab_delimiter(self):
         ])
 
     def test_quotechar(self):
-        input_file = StringIO('a,b,c\n1*2,3,4\n')
+        input_file = io.BytesIO(b'a,b,c\n1*2,3,4\n')
 
         with stdin_as_string(input_file):
             self.assertLines(['-Q', '*'], [
@@ -65,7 +65,7 @@ def test_quotechar(self):
         input_file.close()
 
     def test_doublequote(self):
-        input_file = StringIO('a\n"a ""quoted"" string"')
+        input_file = io.BytesIO(b'a\n"a ""quoted"" string"')
 
         with stdin_as_string(input_file):
             self.assertLines(['-P', '#', '-B'], [
@@ -76,7 +76,7 @@ def test_doublequote(self):
         input_file.close()
 
     def test_escapechar(self):
-        input_file = StringIO('a,b,c\n1"2,3,4\n')
+        input_file = io.BytesIO(b'a,b,c\n1"2,3,4\n')
 
         with stdin_as_string(input_file):
             self.assertLines(['-P', '#', '-U', '3'], [
diff --git a/tests/test_utilities/test_csvjson.py b/tests/test_utilities/test_csvjson.py
index 4bfb4ec4f..302456c5d 100644
--- a/tests/test_utilities/test_csvjson.py
+++ b/tests/test_utilities/test_csvjson.py
@@ -1,6 +1,6 @@
+import io
 import json
 import sys
-from io import StringIO
 from unittest.mock import patch
 
 from csvkit.utilities.csvjson import CSVJSON, launch_new_instance
@@ -58,7 +58,7 @@ def test_keying(self):
         self.assertDictEqual(js, {'True': {'a': True, 'c': 3.0, 'b': 2.0}})
 
     def test_duplicate_keys(self):
-        output_file = StringIO()
+        output_file = io.StringIO()
         utility = CSVJSON(['-k', 'a', 'examples/dummy3.csv'], output_file)
         self.assertRaisesRegex(ValueError,
                                'Value True is not unique in the key column.',
diff --git a/tests/test_utilities/test_csvlook.py b/tests/test_utilities/test_csvlook.py
index f11c4b9f6..d3817b107 100644
--- a/tests/test_utilities/test_csvlook.py
+++ b/tests/test_utilities/test_csvlook.py
@@ -1,5 +1,5 @@
+import io
 import sys
-from io import StringIO
 from unittest.mock import patch
 
 from csvkit.utilities.csvlook import CSVLook, launch_new_instance
@@ -127,7 +127,7 @@ def test_max_column_width(self):
         ])
 
     def test_stdin(self):
-        input_file = StringIO('a,b,c\n1,2,3\n4,5,6\n')
+        input_file = io.BytesIO(b'a,b,c\n1,2,3\n4,5,6\n')
 
         with stdin_as_string(input_file):
             self.assertLines([], [
diff --git a/tests/test_utilities/test_csvsort.py b/tests/test_utilities/test_csvsort.py
index d053d93f3..06cea1ddc 100644
--- a/tests/test_utilities/test_csvsort.py
+++ b/tests/test_utilities/test_csvsort.py
@@ -1,5 +1,5 @@
+import io
 import sys
-from io import StringIO
 from unittest.mock import patch
 
 from csvkit.utilities.csvsort import CSVSort, launch_new_instance
@@ -78,7 +78,7 @@ def test_sort_t_and_nulls(self):
         self.assertEqual(test_order, new_order)
 
     def test_stdin(self):
-        input_file = StringIO('a,b,c\n4,5,6\n1,2,3\n')
+        input_file = io.BytesIO(b'a,b,c\n4,5,6\n1,2,3\n')
 
         with stdin_as_string(input_file):
             self.assertLines([], [
diff --git a/tests/test_utilities/test_csvsql.py b/tests/test_utilities/test_csvsql.py
index 9d49d54e8..4bca6e475 100644
--- a/tests/test_utilities/test_csvsql.py
+++ b/tests/test_utilities/test_csvsql.py
@@ -1,6 +1,6 @@
+import io
 import os
 import sys
-from io import StringIO
 from textwrap import dedent
 from unittest.mock import patch
 
@@ -108,7 +108,7 @@ def test_linenumbers(self):
         '''))  # noqa: W291
 
     def test_stdin(self):
-        input_file = StringIO('a,b,c\n4,2,3\n')
+        input_file = io.BytesIO(b'a,b,c\n4,2,3\n')
 
         with stdin_as_string(input_file):
             sql = self.get_output(['--tables', 'foo'])
@@ -124,7 +124,7 @@ def test_stdin(self):
         input_file.close()
 
     def test_stdin_and_filename(self):
-        input_file = StringIO("a,b,c\n1,2,3\n")
+        input_file = io.BytesIO(b'a,b,c\n1,2,3\n')
 
         with stdin_as_string(input_file):
             sql = self.get_output(['-', 'examples/dummy.csv'])
@@ -135,7 +135,7 @@ def test_stdin_and_filename(self):
         input_file.close()
 
     def test_query(self):
-        input_file = StringIO("a,b,c\n1,2,3\n")
+        input_file = io.BytesIO(b'a,b,c\n1,2,3\n')
 
         with stdin_as_string(input_file):
             sql = self.get_output(['--query', 'SELECT m.usda_id, avg(i.sepal_length) AS mean_sepal_length FROM iris '
@@ -150,7 +150,7 @@ def test_query(self):
         input_file.close()
 
     def test_query_empty(self):
-        input_file = StringIO()
+        input_file = io.BytesIO()
 
         with stdin_as_string(input_file):
             output = self.get_output(['--query', 'SELECT 1'])
@@ -185,14 +185,14 @@ def test_before_after_insert(self):
                          'SELECT 1; CREATE TABLE foobar (date DATE)', '--after-insert',
                          'INSERT INTO dummy VALUES (0, 5, 6)'])
 
-        output_file = StringIO()
+        output_file = io.StringIO()
         utility = SQL2CSV(['--db', 'sqlite:///' + self.db_file, '--query', 'SELECT * FROM foobar'], output_file)
         utility.run()
         output = output_file.getvalue()
         output_file.close()
         self.assertEqual(output, 'date\n')
 
-        output_file = StringIO()
+        output_file = io.StringIO()
         utility = SQL2CSV(['--db', 'sqlite:///' + self.db_file, '--query', 'SELECT * FROM dummy'], output_file)
         utility.run()
         output = output_file.getvalue()
diff --git a/tests/test_utilities/test_csvstack.py b/tests/test_utilities/test_csvstack.py
index a63f2c485..7187971a6 100644
--- a/tests/test_utilities/test_csvstack.py
+++ b/tests/test_utilities/test_csvstack.py
@@ -21,7 +21,7 @@ def test_skip_lines(self):
         ])
 
     def test_skip_lines_stdin(self):
-        with open('examples/test_skip_lines.csv') as f, stdin_as_string(f):
+        with open('examples/test_skip_lines.csv', 'rb') as f, stdin_as_string(f):
             self.assertRows(['--skip-lines', '3', '-', 'examples/test_skip_lines.csv'], [
                 ['a', 'b', 'c'],
                 ['1', '2', '3'],
@@ -62,14 +62,14 @@ def test_multiple_file_stack_col_ragged(self):
         ])
 
     def test_multiple_file_stack_col_ragged_stdin(self):
-        with open('examples/dummy.csv') as f, stdin_as_string(f):
+        with open('examples/dummy.csv', 'rb') as f, stdin_as_string(f):
             self.assertRows(['-', 'examples/dummy_col_shuffled_ragged.csv'], [
                 ['a', 'b', 'c', 'd'],
                 ['1', '2', '3', ''],
                 ['1', '2', '3', '4'],
             ])
 
-        with open('examples/dummy.csv') as f, stdin_as_string(f):
+        with open('examples/dummy.csv', 'rb') as f, stdin_as_string(f):
             self.assertRows(['examples/dummy_col_shuffled_ragged.csv', '-'], [
                 ['b', 'c', 'a', 'd'],
                 ['2', '3', '1', '4'],
@@ -101,14 +101,14 @@ def test_no_header_row_basic(self):
         ])
 
     def test_no_header_row_basic_stdin(self):
-        with open('examples/no_header_row.csv') as f, stdin_as_string(f):
+        with open('examples/no_header_row.csv', 'rb') as f, stdin_as_string(f):
             self.assertRows(['--no-header-row', '-', 'examples/no_header_row2.csv'], [
                 ['a', 'b', 'c'],
                 ['1', '2', '3'],
                 ['4', '5', '6'],
             ])
 
-        with open('examples/no_header_row.csv') as f, stdin_as_string(f):
+        with open('examples/no_header_row.csv', 'rb') as f, stdin_as_string(f):
             self.assertRows(['--no-header-row', 'examples/no_header_row2.csv', '-'], [
                 ['a', 'b', 'c'],
                 ['4', '5', '6'],
diff --git a/tests/test_utilities/test_in2csv.py b/tests/test_utilities/test_in2csv.py
index 49543c484..c78e0a7ed 100644
--- a/tests/test_utilities/test_in2csv.py
+++ b/tests/test_utilities/test_in2csv.py
@@ -1,6 +1,6 @@
+import io
 import os
 import sys
-from io import StringIO
 from unittest.mock import patch
 
 from csvkit.utilities.in2csv import In2CSV, launch_new_instance
@@ -38,7 +38,7 @@ def test_blanks(self):
         self.assertConverted('csv', 'examples/blanks.csv', 'examples/blanks.csv', ['--blanks'])
 
     def test_null_value(self):
-        input_file = StringIO('a,b\nn/a,\\N')
+        input_file = io.BytesIO(b'a,b\nn/a,\\N')
 
         with stdin_as_string(input_file):
             self.assertLines(['-f', 'csv', '--null-value', '\\N'], [
@@ -49,7 +49,7 @@ def test_null_value(self):
         input_file.close()
 
     def test_null_value_blanks(self):
-        input_file = StringIO('a,b\nn/a,\\N')
+        input_file = io.BytesIO(b'a,b\nn/a,\\N')
 
         with stdin_as_string(input_file):
             self.assertLines(['-f', 'csv', '--null-value', '\\N', '--blanks'], [
@@ -153,7 +153,7 @@ def test_csv_no_headers_streaming(self):
                              ['--no-header-row', '--no-inference', '--snifflimit', '0'])
 
     def test_csv_datetime_inference(self):
-        input_file = StringIO('a\n2015-01-01T00:00:00Z')
+        input_file = io.BytesIO(b'a\n2015-01-01T00:00:00Z')
 
         with stdin_as_string(input_file):
             self.assertLines(['-f', 'csv'], [
@@ -182,9 +182,9 @@ def test_xlsx_no_inference(self):
         ])
 
     def test_geojson_no_inference(self):
-        input_file = StringIO(
-            '{"a": 1, "b": 2, "type": "FeatureCollection", "features": [{"geometry": {}, "properties": '
-            '{"a": 1, "b": 2, "c": 3}}]}')
+        input_file = io.BytesIO(
+            b'{"a": 1, "b": 2, "type": "FeatureCollection", "features": [{"geometry": {}, "properties": '
+            b'{"a": 1, "b": 2, "c": 3}}]}')
 
         with stdin_as_string(input_file):
             self.assertLines(['--no-inference', '-f', 'geojson'], [
@@ -195,7 +195,7 @@ def test_geojson_no_inference(self):
         input_file.close()
 
     def test_json_no_inference(self):
-        input_file = StringIO('[{"a": 1, "b": 2, "c": 3}]')
+        input_file = io.BytesIO(b'[{"a": 1, "b": 2, "c": 3}]')
 
         with stdin_as_string(input_file):
             self.assertLines(['--no-inference', '-f', 'json'], [
@@ -206,7 +206,7 @@ def test_json_no_inference(self):
         input_file.close()
 
     def test_ndjson_no_inference(self):
-        input_file = StringIO('{"a": 1, "b": 2, "c": 3}')
+        input_file = io.BytesIO(b'{"a": 1, "b": 2, "c": 3}')
 
         with stdin_as_string(input_file):
             self.assertLines(['--no-inference', '-f', 'ndjson'], [
diff --git a/tests/test_utilities/test_sql2csv.py b/tests/test_utilities/test_sql2csv.py
index 5fb51d192..e5349a2bf 100644
--- a/tests/test_utilities/test_sql2csv.py
+++ b/tests/test_utilities/test_sql2csv.py
@@ -1,6 +1,6 @@
+import io
 import os
 import sys
-from io import StringIO
 from unittest.mock import patch
 
 try:
@@ -71,7 +71,7 @@ def test_file_with_query(self):
         self.assertTrue('54' in csv)
 
     def test_stdin(self):
-        input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
+        input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')
 
         with stdin_as_string(input_file):
             csv = self.get_output([])
@@ -82,7 +82,7 @@ def test_stdin(self):
         input_file.close()
 
     def test_stdin_with_query(self):
-        input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
+        input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')
 
         with stdin_as_string(input_file):
             csv = self.get_output(['--query', 'select 6*9 as question'])
@@ -93,7 +93,7 @@ def test_stdin_with_query(self):
         input_file.close()
 
     def test_stdin_with_file(self):
-        input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
+        input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')
 
         with stdin_as_string(input_file):
             csv = self.get_output(['examples/test.sql'])
@@ -104,7 +104,7 @@ def test_stdin_with_file(self):
         input_file.close()
 
     def test_stdin_with_file_and_query(self):
-        input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
+        input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')
 
         with stdin_as_string(input_file):
             csv = self.get_output(['examples/test.sql', '--query', 'select 6*9 as question'])
diff --git a/tests/utils.py b/tests/utils.py
index 6b92489b2..876a7e7f7 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -17,11 +17,11 @@
 
 """
 
+import io
 import sys
 import unittest
 import warnings
 from contextlib import contextmanager
-from io import StringIO
 
 import agate
 
@@ -39,7 +39,7 @@ def stderr_as_stdout():
 @contextmanager
 def stdin_as_string(content):
     temp = sys.stdin
-    sys.stdin = content
+    sys.stdin = io.TextIOWrapper(content)
     yield
     sys.stdin = temp
 
@@ -48,7 +48,7 @@ class CSVKitTestCase(unittest.TestCase):
     warnings.filterwarnings(action='ignore', module='agate')
 
     def get_output(self, args):
-        output_file = StringIO()
+        output_file = io.StringIO()
 
         utility = self.Utility(args, output_file)
         utility.run()
@@ -59,7 +59,7 @@ def get_output(self, args):
         return output
 
     def get_output_as_io(self, args):
-        return StringIO(self.get_output(args))
+        return io.StringIO(self.get_output(args))
 
     def get_output_as_list(self, args):
         return self.get_output(args).split('\n')
@@ -89,7 +89,7 @@ def assertLines(self, args, rows, newline_at_eof=True):
 
 class EmptyFileTests:
     def test_empty(self):
-        with open('examples/empty.csv') as f, stdin_as_string(f):
+        with open('examples/empty.csv', 'rb') as f, stdin_as_string(f):
             utility = self.Utility(getattr(self, 'default_args', []))
             utility.run()
 
@@ -105,7 +105,7 @@ def test_names(self):
     def test_invalid_options(self):
         args = ['-n', '--no-header-row', 'examples/dummy.csv']
 
-        output_file = StringIO()
+        output_file = io.StringIO()
         utility = self.Utility(args, output_file)
 
         with self.assertRaises(RequiredHeaderError):
@@ -118,7 +118,7 @@ class ColumnsTests:
     def test_invalid_column(self):
         args = getattr(self, 'columns_args', []) + ['-c', '0', 'examples/dummy.csv']
 
-        output_file = StringIO()
+        output_file = io.StringIO()
         utility = self.Utility(args, output_file)
 
         with self.assertRaises(ColumnIdentifierError):