From 1cc5e289d289da6b350e8a1764b89eb6575625a7 Mon Sep 17 00:00:00 2001 From: Florian Strzelecki Date: Sun, 13 Jan 2019 23:17:04 +0100 Subject: [PATCH 1/9] core: 2-byte unicode characters are not truncated anymore To send a long message, Sopel needs to split the message into multiple lines of text, and used to split the bytestring version of the unicode text. This would cause an issue when 2-byte unicode characters get truncated in half, displaying an unwanted message. In this commit, I split the unicode string properly, and I truncate it until its encoded version fits within the max length. See also the test/test_tools.py file for all the use cases handled by this change. --- sopel/bot.py | 28 ++++--------- sopel/tools/__init__.py | 34 +++++++++++++++ test/test_tools.py | 93 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 20 deletions(-) create mode 100644 test/test_tools.py diff --git a/sopel/bot.py b/sopel/bot.py index 28be3539b3..7b2b09801d 100644 --- a/sopel/bot.py +++ b/sopel/bot.py @@ -290,27 +290,15 @@ def say(self, text, recipient, max_messages=1): message will contain the entire remainder, which may be truncated by the server. """ - # We're arbitrarily saying that the max is 400 bytes of text when - # messages will be split. Otherwise, we'd have to acocunt for the bot's - # hostmask, which is hard. - max_text_length = 400 - # Encode to bytes, for propper length calculation - if isinstance(text, unicode): - encoded_text = text.encode('utf-8') - else: - encoded_text = text excess = '' - if max_messages > 1 and len(encoded_text) > max_text_length: - last_space = encoded_text.rfind(' '.encode('utf-8'), 0, max_text_length) - if last_space == -1: - excess = encoded_text[max_text_length:] - encoded_text = encoded_text[:max_text_length] - else: - excess = encoded_text[last_space + 1:] - encoded_text = encoded_text[:last_space] - # We'll then send the excess at the end - # Back to unicode again, so we don't screw things up later. 
- text = encoded_text.decode('utf-8') + if not isinstance(text, unicode): + # Make sure we are dealing with unicode string + text = text.decode('utf-8') + + if max_messages > 1: + # Manage multi-line only when needed + text, excess = tools.get_sendable_message(text) + try: self.sending.acquire() diff --git a/sopel/tools/__init__.py b/sopel/tools/__init__.py index b210f71c94..738a46af84 100644 --- a/sopel/tools/__init__.py +++ b/sopel/tools/__init__.py @@ -152,6 +152,40 @@ def get_nickname_command_pattern(command): """.format(command=command) +def get_sendable_message(text, max_length=400): + """Get a sendable ``text`` message, with its excess when needed. + + :param str txt: unicode string of text to send + :param int max_length: maximum length of the message to be sendable + :return: a tuple of two values, the sendable text and its excess text + + We're arbitrarily saying that the max is 400 bytes of text when + messages will be split. Otherwise, we'd have to account for the bot's + hostmask, which is hard. + + The `max_length` is the max length of text in **bytes**, but we take + care of unicode 2-bytes characters, by working on the unicode string, + then making sure the bytes version is smaller than the max length. + """ + unicode_max_length = max_length + excess = '' + + while len(text.encode('utf-8')) > max_length: + last_space = text.rfind(' ', 0, unicode_max_length) + if last_space == -1: + # No last space, just split where it is possible + excess = text[unicode_max_length:] + excess + text = text[:unicode_max_length] + # Decrease max length for the unicode string + unicode_max_length = unicode_max_length - 1 + else: + # Split at the last best space found + excess = text[last_space:] + text = text[:last_space] + + return text, excess.lstrip() + + def deprecated(old): def new(*args, **kwargs): print('Function %s is deprecated.' 
% old.__name__, file=sys.stderr) diff --git a/test/test_tools.py b/test/test_tools.py new file mode 100644 index 0000000000..c4289b738a --- /dev/null +++ b/test/test_tools.py @@ -0,0 +1,93 @@ +# coding=utf-8 +"""Tests sopel.tools""" +from __future__ import unicode_literals, absolute_import, print_function, division + + +from sopel import tools + + +def test_get_sendable_message_default(): + initial = 'aaaa' + text, excess = tools.get_sendable_message(initial) + + assert text == initial + assert excess == '' + + +def test_get_sendable_message_limit(): + initial = 'a' * 400 + text, excess = tools.get_sendable_message(initial) + + assert text == initial + assert excess == '' + + +def test_get_sendable_message_excess(): + initial = 'a' * 401 + text, excess = tools.get_sendable_message(initial) + + assert text == 'a' * 400 + assert excess == 'a' + + +def test_get_sendable_message_excess_space(): + # aaa...aaa bbb...bbb + initial = ' '.join(['a' * 200, 'b' * 200]) + text, excess = tools.get_sendable_message(initial) + + assert text == 'a' * 200 + assert excess == 'b' * 200 + + +def test_get_sendable_message_excess_space_limit(): + # aaa...aaa bbb...bbb + initial = ' '.join(['a' * 400, 'b' * 200]) + text, excess = tools.get_sendable_message(initial) + + assert text == 'a' * 400 + assert excess == 'b' * 200 + + +def test_get_sendable_message_excess_bigger(): + # aaa...aaa bbb...bbb + initial = ' '.join(['a' * 401, 'b' * 1000]) + text, excess = tools.get_sendable_message(initial) + + assert text == 'a' * 400 + assert excess == 'a ' + 'b' * 1000 + + +def test_get_sendable_message_optional(): + text, excess = tools.get_sendable_message('aaaa', 3) + assert text == 'aaa' + assert excess == 'a' + + text, excess = tools.get_sendable_message('aaa bbb', 3) + assert text == 'aaa' + assert excess == 'bbb' + + text, excess = tools.get_sendable_message('aa bb cc', 3) + assert text == 'aa' + assert excess == 'bb cc' + + +def test_get_sendable_message_two_bytes(): + text, excess = 
tools.get_sendable_message('αααα', 4) + assert text == 'αα' + assert excess == 'αα' + + text, excess = tools.get_sendable_message('αααα', 5) + assert text == 'αα' + assert excess == 'αα' + + text, excess = tools.get_sendable_message('α ααα', 4) + assert text == 'α' + assert excess == 'ααα' + + text, excess = tools.get_sendable_message('αα αα', 4) + assert text == 'αα' + assert excess == 'αα' + + text, excess = tools.get_sendable_message('ααα α', 4) + assert text == 'αα' + assert excess == 'α α' From ac5bd8b11ee72c3959da2b43253d712380c099e9 Mon Sep 17 00:00:00 2001 From: Florian Strzelecki Date: Sun, 13 Jan 2019 23:49:12 +0100 Subject: [PATCH 2/9] core: irc.bot write method properly truncates unicode messages --- sopel/irc.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sopel/irc.py b/sopel/irc.py index 7b259ab08d..aa0597bcae 100644 --- a/sopel/irc.py +++ b/sopel/irc.py @@ -138,13 +138,24 @@ def write(self, args, text=None): # CR-LF (Carriage Return - Line Feed) pair, and these messages SHALL # NOT exceed 512 characters in length, counting all characters # including the trailing CR-LF. Thus, there are 510 characters - # maximum allowed for the command and its parameters. There is no + # maximum allowed for the command and its parameters. There is no # provision for continuation of message lines. 
+ max_length = unicode_max_length = 510 if text is not None: - temp = (' '.join(args) + ' :' + text)[:510] + '\r\n' + temp = (' '.join(args) + ' :' + text) else: - temp = ' '.join(args)[:510] + '\r\n' + temp = ' '.join(args) + + # The max length of 512 is in bytes, not unicode + while len(temp.encode('utf-8')) > max_length: + temp = temp[:unicode_max_length] + unicode_max_length = unicode_max_length - 1 + + # Ends the message with CR-LF + temp = temp + '\r\n' + + # Log and output the message self.log_raw(temp, '>>') self.send(temp.encode('utf-8')) finally: From 9d83b54aa151780643334dc974c829eb3df9bb4b Mon Sep 17 00:00:00 2001 From: Humorous Baby <44451911+HumorBaby@users.noreply.github.com> Date: Tue, 29 Jan 2019 15:25:15 -0500 Subject: [PATCH 3/9] cli: fix configpath issue for wizard Now checks if the config file passed to `-c` already has a `.cfg` extension before appending one. Fixes #1463 --- sopel/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sopel/config/__init__.py b/sopel/config/__init__.py index 473fff3521..ea72ef81e8 100644 --- a/sopel/config/__init__.py +++ b/sopel/config/__init__.py @@ -212,7 +212,7 @@ def _modules(self): def _wizard(section, config=None): dotdir = os.path.expanduser('~/.sopel') - configpath = os.path.join(dotdir, (config or 'default') + '.cfg') + configpath = os.path.join(dotdir, ((config or 'default.cfg') + ('.cfg' if config and not config.endswith('.cfg') else ''))) if section == 'all': _create_config(configpath) elif section == 'mod': From 163a74f2c1bdd920c5df07fe2e34824388725ec7 Mon Sep 17 00:00:00 2001 From: dgw Date: Tue, 29 Jan 2019 16:34:38 -0600 Subject: [PATCH 4/9] travis: allow building maintenance branches Maintenance branches will always look like Major.Minor.X, so it seemed easier to give them a separate branch rule from release tags. 
--- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index d49bfd81ed..7a8b80d28e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ git: branches: only: - master + - /^\d+\.\d+\.x$/ # allows building maintenance branches - /^v?\d+\.\d+(\.\d+)?(-\S*)?$/ # allows building version tags sudo: false # Enables running on faster infrastructure. cache: From 4e8f28d13aa3772eabeed06ad92e4788921ac241 Mon Sep 17 00:00:00 2001 From: Florian Strzelecki Date: Wed, 23 Jan 2019 20:13:07 +0100 Subject: [PATCH 5/9] coretasks: split AUTHENTICATE token in 400-byte chunks (fix #975) --- sopel/coretasks.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/sopel/coretasks.py b/sopel/coretasks.py index 0098fedd1c..b2c8f8adf2 100644 --- a/sopel/coretasks.py +++ b/sopel/coretasks.py @@ -539,6 +539,37 @@ def recieve_cap_ack_sasl(bot): bot.write(('AUTHENTICATE', mech)) +def send_authenticate(bot, token): + """Send ``AUTHENTICATE`` command to server with the given ``token``. + + :param bot: instance of IRC bot that must authenticate + :param str token: authentication token + + In case the ``token`` is more than 400 bytes, we need to split it and send + as many ``AUTHENTICATE`` commands as needed. If the last chunk is 400 bytes + long, we must also send a last empty command (`AUTHENTICATE +` is for empty + line), so the server knows we are done with ``AUTHENTICATE``. + + .. 
seealso:: + + https://ircv3.net/specs/extensions/sasl-3.1.html#the-authenticate-command + + """ + # payload is a base64 encoded token + payload = base64.b64encode(token.encode('utf-8')) + + # split the payload into chunks of at most 400 bytes + chunk_size = 400 + for i in range(0, len(payload), chunk_size): + offset = i + chunk_size + chunk = payload[i:offset] + bot.write(('AUTHENTICATE', chunk)) + + # send empty (+) AUTHENTICATE when payload's length is a multiple of 400 + if len(payload) % chunk_size == 0: + bot.write(('AUTHENTICATE', '+')) + + @sopel.module.event('AUTHENTICATE') @sopel.module.rule('.*') def auth_proceed(bot, trigger): @@ -549,8 +580,7 @@ def auth_proceed(bot, trigger): sasl_username = bot.config.core.auth_username or bot.nick sasl_password = bot.config.core.auth_password sasl_token = '\0'.join((sasl_username, sasl_username, sasl_password)) - # Spec says we do a base 64 encode on the SASL stuff - bot.write(('AUTHENTICATE', base64.b64encode(sasl_token.encode('utf-8')))) + send_authenticate(bot, sasl_token) @sopel.module.event(events.RPL_SASLSUCCESS) From ee9fe88ac8b4bddca3edca3603f1b8562403d37a Mon Sep 17 00:00:00 2001 From: Rusty Bower Date: Wed, 9 Jan 2019 09:50:29 -0600 Subject: [PATCH 6/9] wiktionary: fix query logic (see #1214) wiktionary: fixing .lower() logic --- sopel/modules/wiktionary.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sopel/modules/wiktionary.py b/sopel/modules/wiktionary.py index 0b7c4d81c5..2f33844665 100644 --- a/sopel/modules/wiktionary.py +++ b/sopel/modules/wiktionary.py @@ -99,8 +99,11 @@ def wiktionary(bot, trigger): _etymology, definitions = wikt(word) if not definitions: - bot.say("Couldn't get any definitions for %s." % word) - return + # Cast word to lower to check in case of mismatched user input + _etymology, definitions = wikt(word.lower()) + if not definitions: + bot.say("Couldn't get any definitions for %s." 
% word) + return result = format(word, definitions) if len(result) < 150: From 6bd094e8af26e158f324f1c8720325f85f854145 Mon Sep 17 00:00:00 2001 From: dgw Date: Fri, 1 Feb 2019 02:33:43 -0600 Subject: [PATCH 7/9] split the hair again on supported IPython versions I can't WAIT to drop the ipython module from core. Less than a year after dropping py2 support, upstream has also dumped support for Python below 3.5. They're moving too fast for our slow-ass release cycle (and it's not even an important dependency). --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2494b29236..88c8349cd4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ praw<6.0.0 pyenchant; python_version < '3.7' geoip2 ipython<6.0; python_version < '3.3' -ipython>=6.0,<7.0; python_version >= '3.3' +ipython>=6.0,<7.0; python_version >= '3.3' and python_version < '3.5' +ipython>=7.0,<8.0; python_version >= '3.5' requests>=2.0.0,<3.0.0 dnspython From 4ac49457ae1e7b8c2125853430a1dd859c47f4f2 Mon Sep 17 00:00:00 2001 From: dgw Date: Fri, 1 Feb 2019 02:52:33 -0600 Subject: [PATCH 8/9] Get specific about dnspython requirement dnspython dropped support for Python 3.3 in version 1.16.0, so we have to special-case that. Otherwise, py2.7 and 3.4+ are supported until version 2.0 comes out. This is the part where I maybe come to regret adding this dependency, isn't it? Might be worth trying to make dnspython optional later. 
--- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2494b29236..b1dd623b1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,6 @@ geoip2 ipython<6.0; python_version < '3.3' ipython>=6.0,<7.0; python_version >= '3.3' requests>=2.0.0,<3.0.0 -dnspython +dnspython<2.0; python_version >= '2.7' and python_version < '3.0' +dnspython<1.16.0; python_version == '3.3' +dnspython<3.0; python_version >= '3.4' From 90a091614f7428fc6f419f5a50810850446b36f6 Mon Sep 17 00:00:00 2001 From: dgw Date: Fri, 1 Feb 2019 03:08:31 -0600 Subject: [PATCH 9/9] Release 6.6.2 --- NEWS | 13 +++++++++++++ sopel/__init__.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index db7aebdfa2..94635c0689 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,16 @@ +Changes between 6.6.1 and 6.6.2 +=============================== +Module changes: +* wiktionary tries harder to get a valid result before erroring out + +Core changes: +* Fixed an inconsistency between interpretations of the --config option in + normal operation vs. wizard mode +* Requirement specifiers tightened up to reduce/prevent pip trying to install + incompatible dependency versions (IPython, dnspython) +* SASL token is now split when required according to spec +* Multi-byte Unicode characters are now handled correctly when splitting lines + Changes between 6.6.0 and 6.6.1 =============================== Module changes: diff --git a/sopel/__init__.py b/sopel/__init__.py index bdd8ba6ed2..f796233139 100644 --- a/sopel/__init__.py +++ b/sopel/__init__.py @@ -30,7 +30,7 @@ import traceback import signal -__version__ = '6.6.1' +__version__ = '6.6.2' def _version_info(version=__version__):