From bfbeba5f4c3c28637cddf78c6c7e712ecdd07efc Mon Sep 17 00:00:00 2001 From: "Jesus M. Gonzalez-Barahona" Date: Fri, 18 Mar 2016 23:23:57 +0100 Subject: [PATCH] [git] Use "backslashreplace" instead of "surrogateescape". When decoding as utf8, if a character cannot be decoded, use the backslashreplace error handler instead of the surrogateescape error handler. Fixes #18 for the git backend; other backends may need the same fix. --- perceval/backends/git.py | 8 ++++---- tests/data/git_bad_utf8.txt | 10 ++++++++++ tests/test_git.py | 18 +++++++++++++++++- 3 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 tests/data/git_bad_utf8.txt diff --git a/perceval/backends/git.py b/perceval/backends/git.py index c5ee9d4de..80ea4a1c7 100644 --- a/perceval/backends/git.py +++ b/perceval/backends/git.py @@ -154,7 +154,7 @@ def parse_git_log_from_file(filepath): :raises OSError: raised when an error occurs reading the given file """ - with open(filepath, 'r', errors='surrogateescape') as f: + with open(filepath, 'r', errors='backslashreplace') as f: parser = GitParser(f) for commit in parser.parse(): @@ -648,7 +648,7 @@ def log(self, from_date=None, encoding='utf-8'): self.uri, self.dirpath) for line in gitlog: - line = line.decode(encoding, errors='surrogateescape') + line = line.decode(encoding, errors='backslashreplace') yield line @staticmethod @@ -675,10 +675,10 @@ def _exec(cmd, cwd=None, env=None): raise RepositoryError(cause=str(e)) if proc.returncode != 0: - err = errs.decode('utf-8', errors='surrogateescape') + err = errs.decode('utf-8', errors='backslashreplace') cause = "git command - %s" % err raise RepositoryError(cause=cause) else: - logging.debug(errs.decode('utf-8', errors='surrogateescape')) + logging.debug(errs.decode('utf-8', errors='backslashreplace')) return outs diff --git a/tests/data/git_bad_utf8.txt b/tests/data/git_bad_utf8.txt new file mode 100644 index 000000000..8f4db5ee4 --- /dev/null +++ b/tests/data/git_bad_utf8.txt @@ -0,0 +1,10 @@ 
+commit c4c8ea948aa21527d502e87227b2f1d951bc506d d69332b875efb52ea5276d5638ce572fcd7375f2 +Author: Jason Gaston +AuthorDate: Sat Apr 16 15:24:43 2005 -0700 +Commit: Linus Torvalds +CommitDate: Sat Apr 16 15:24:43 2005 -0700 + + [PATCH] intel8x0: AC'97 audio patch for Intel ESB2 + + Signed-off-by:  Jason Gaston + diff --git a/tests/test_git.py b/tests/test_git.py index 975a8cfeb..65885f118 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -202,7 +202,23 @@ def test_git_encoding_error(self): commit = result[0] self.assertEqual(commit['commit'], 'cb24e4f2f7b2a7f3450bfb15d1cbaa97371e93fb') - self.assertEqual(commit['message'], 'Calling \udc93Open Type\udc94 (CTRL+SHIFT+T) after startup - performance improvement.') + self.assertEqual(commit['message'], 'Calling \\x93Open Type\\x94 (CTRL+SHIFT+T) after startup - performance improvement.') + + def test_git_utf8_error(self): + """Characters that cannot decoded as utf8 can be later encoded as utf8. + + This test raised the following exception before being fixed: + "UnicodeEncodeError: 'utf-8' codec can't encode character '\udca0' + in position 153: surrogates not allowed" + + """ + + message_ok = b"[PATCH] intel8x0: AC'97 audio patch for Intel ESB2\n" \ + + b"\nSigned-off-by: \\xa0Jason Gaston " + + commits = Git.parse_git_log_from_file("data/git_bad_utf8.txt") + commit = [commit for commit in commits][0] + self.assertEqual(commit['message'].encode('utf8'), message_ok) def test_git_parser_from_iter(self): """Test if the static method parses a git log from a repository"""