diff --git a/.gitignore b/.gitignore index 0b5492d..7c384e0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ *.pyc +.coverage .idea/* +.tox/ +TAGS +dist/* doc/build/* doc/html/* -MANIFEST -dist/* +reparse.egg-info/ diff --git a/.landscape.yml b/.landscape.yml new file mode 100644 index 0000000..c276a05 --- /dev/null +++ b/.landscape.yml @@ -0,0 +1,6 @@ +strictness: veryhigh +pep8: + full: true +doc-warnings: false +test-warnings: true +max-line-length: 80 diff --git a/.travis.yml b/.travis.yml index 69534ec..c5f4335 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,11 +2,12 @@ language: python sudo: false python: - 2.7 + - 3.2 - 3.3 - 3.4 + - 3.5 install: - - pip install --use-mirrors pyyaml - - pip install --use-mirrors -r requirements.txt -script: nosetests --with-doctest + - pip install -r requirements-dev.txt +script: nosetests notifications: email: false diff --git a/doc/source/about.rst b/doc/source/about.rst index 5d5972a..79fe21d 100644 --- a/doc/source/about.rst +++ b/doc/source/about.rst @@ -1,7 +1,7 @@ About: Why another tool for parsing? ==================================== -RE|PARSE is simply a tool for combining regular expressions together +Reparse is simply a tool for combining regular expressions together and using a regular expression engine to scan/search/parse/process input for certain tasks. Larger parsing tools like YACC/Bison, ANTLR, and others are really @@ -9,27 +9,27 @@ good for structured input like computer code or xml. They aren't specifically designed for scanning and parsing semi-structured data from unstructured text (like books, or internet documents, or diaries). -RE|PARSE is designed to work with exactly that kind of stuff, (and is completely +Reparse is designed to work with exactly that kind of stuff, (and is completely useless for the kinds of tasks any of the above is often used for). Parsing Spectrum ---------------- -RE|PARSE isn't the first parser of it's kind. 
A hypothetical spectrum +Reparse isn't the first parser of its kind. A hypothetical spectrum of parsers from pattern-finding only all the way to highly-featured, structured grammars might look something like this:: - v- RE|PARSE v- YACC/Bison + v- Reparse v- YACC/Bison UNSTRUCTURED |-------------------------| STRUCTURED ^- Regex ^- Parboiled/PyParsing -RE|PARSE is in fact very featureless. It's only a little better +Reparse is in fact very featureless. It's only a little better than plain regular expressions. Still, you might find it ideal for the kinds of tasks it was designed to deal with (like dates and addresses). -What kind of things might RE|PARSE be useful for parsing? ---------------------------------------------------------- +What kind of things might Reparse be useful for parsing? +-------------------------------------------------------- Any kind of semi-structured formats: @@ -41,17 +41,17 @@ Any kind of semi-structured formats: - Addresses - Phone numbers -Or in other words, anything you might consider parsing with Regex, might consider RE|PARSE, +Or in other words, for anything you might consider parsing with Regex, you might also consider Reparse, especially if you are considering combining multiple regular expressions together. Why Regular Expressions --------------------------------- +----------------------- PyParsing (Python) and Parboiled (JVM) also have use-cases very similar -to RE|PARSE, and they are much more feature-filled. They have their own (much more powerful) +to Reparse, and they are much more feature-filled. They have their own (much more powerful) DSL for parsing text. 
-RE|PARSE uses Regular Expressions which has some advantages: +Reparse uses Regular Expressions which has some advantages: - Short, minimal Syntax - Universal (with some minor differences between different engines) @@ -59,20 +59,20 @@ RE|PARSE uses Regular Expressions which has some advantages: - Moderately Easy-to-learn (Though this is highly subjective) - Many programmers already know the basics - Skills can be carried else where -- **Regular Expressions can be harvested elsewhere and used within RE|PARSE** +- **Regular Expressions can be harvested elsewhere and used within Reparse** - Decent performance over large inputs - Ability to use fuzzy matching regex engines -Limitations of RE|PARSE -------------------------- +Limitations of Reparse +---------------------- Regular Expressions have been known to catch input that was unexpected, or miss input that was expected due to unforeseen edge cases. -RE|PARSE provides tools to help alleviate this by checking the expressions against expected matching +Reparse provides tools to help alleviate this by checking the expressions against expected matching inputs, and against expected non-matching inputs. This library is very limited in what it can parse, if you realize you need something like a recursive grammar, you might want to try PyParsing or something greater -(though RE|PARSE might be helpful as a 'first step' matching and transforming the parse-able data before it is properly +(though Reparse might be helpful as a 'first step' matching and transforming the parse-able data before it is properly parsed by a different library). \ No newline at end of file diff --git a/doc/source/best_practices.rst b/doc/source/best_practices.rst index 2404497..330d82c 100644 --- a/doc/source/best_practices.rst +++ b/doc/source/best_practices.rst @@ -13,7 +13,7 @@ they can have a long productive life without getting out of control: - Never let a regex become too big to be easily understood. Split up big regex into smaller expressions. 
(Sensible splits won't hurt them). - Maintain a Matches and Non-Matches - - RE|PARSE can use this to test your Regex to make sure they are matching properly + - Reparse can use this to test your Regex to make sure they are matching properly - It helps maintainers see which regular expressions match what quickly - It helps show your intention with each expression, so that others can confidently improve or modify them - Maintain a description which talks about what you are trying to match with each regex, diff --git a/doc/source/howto.rst b/doc/source/howto.rst index 01f6817..756a53f 100644 --- a/doc/source/howto.rst +++ b/doc/source/howto.rst @@ -1,5 +1,5 @@ -Howto: How to use RE|PARSE -========================== +Howto: How to use Reparse +========================= You will need @@ -10,15 +10,15 @@ You will need #. Some example texts that you will want to parse and their solutions. This will be useful to check your parser and will help you put together the expressions and patterns. -1. Setup Python & RE|PARSE --------------------------- +1. Setup Python & Reparse +------------------------- -See :ref:`installation-howto` for instructions on how to install RE|PARSE +See :ref:`installation-howto` for instructions on how to install Reparse -2. Layout of an example RE|PARSE parser -------------------------------------- +2. Layout of an example Reparse parser +-------------------------------------- -RE|PARSE needs 3 things in its operation: +Reparse needs 3 things in its operation: 1. Functions: A dictionary with String Key -> Function Value mapping. @@ -113,7 +113,7 @@ in expressions and merely *combined* in patterns. Order: 2 # I could have used instead to use a pattern inside a pattern but it wouldn't have made a difference really (just an extra function call). -The order field tells RE|PARSE which pattern to pick if multiple patterns match. +The order field tells Reparse which pattern to pick if multiple patterns match. 
Generally speaking, the more specific patterns should be ordered higher than the lower ones (you wouldn't want someone to try and call a fax machine!). @@ -129,9 +129,9 @@ Done this way, I could have had 3 different formats for Area Code and the patter on any of them. I didn't here because that'd be overkill for phone numbers. 5. Writing your functions.py file ----------------------------------- +--------------------------------- -RE|PARSE matches text and also does some parsing using functions. +Reparse matches text and also does some parsing using functions. The order in which the functions are run and results passed are as follows: @@ -179,7 +179,7 @@ I used namedtuples here, but you can parse your output anyway you want to. 6. Combining it all together! ----------------------------- -The builder.py module contains some functions to build a RE|PARSE system together. +The builder.py module contains some functions to build a Reparse system together. Here's how I'd put together my phone number parser: .. code-block:: python diff --git a/doc/source/modules.rst b/doc/source/modules.rst index ff29990..c06e2d6 100644 --- a/doc/source/modules.rst +++ b/doc/source/modules.rst @@ -1,4 +1,4 @@ -Here lies the embedded docblock documentation for the various parts of RE|PARSE. +Here lies the embedded docblock documentation for the various parts of Reparse. expression ========= diff --git a/examples/colortime/colortime.py b/examples/colortime/colortime.py index f08ba2d..04534f8 100644 --- a/examples/colortime/colortime.py +++ b/examples/colortime/colortime.py @@ -1,16 +1,17 @@ +from __future__ import unicode_literals """ Example from docs: ->>> colortime_parser("~ ~ ~ go to the store ~ buy green at 11pm! ~ ~") +>>> colortime_parser("~ ~ ~ go to the store ~ buy green at 11pm! ~ ~") # doctest: +IGNORE_UNICODE [('green', datetime.time(23, 0))] In this case the processing functions weren't specified but you still get a useful result as a default. 
->>> colortime_parser("~ ~ ~ Crazy 2pm green ~ ~") +>>> colortime_parser("~ ~ ~ Crazy 2pm green ~ ~") # doctest: +IGNORE_UNICODE [['green']] """ # Example stuff ----------------------------------------------------- # Have to add the parent directory just in case you -# run this file in the demo directory without installing RE|PARSE +# run this file in the demo directory without installing Reparse import sys sys.path.append('../..') @@ -23,7 +24,7 @@ path += "/" -# RE|PARSE ---------------------------------------------------------- +# Reparse ---------------------------------------------------------- from examples.colortime.functions import functions import reparse diff --git a/examples/colortime/functions.py b/examples/colortime/functions.py index f2c3335..581f7d5 100644 --- a/examples/colortime/functions.py +++ b/examples/colortime/functions.py @@ -18,7 +18,7 @@ def color_time(Color=None, Time=None): return Color, Time # --------------- Function list ------------------ -# This is the dictionary that is used by the RE|PARSE +# This is the dictionary that is used by the Reparse # expression builder. The key is the same value used in the patterns.yaml # file under ``Function: ``. The value is a reference to function. diff --git a/examples/phone/functions.py b/examples/phone/functions.py index 5b55605..08136d1 100644 --- a/examples/phone/functions.py +++ b/examples/phone/functions.py @@ -25,7 +25,7 @@ def fax_phone(p): return p._replace(fax=True) # --------------- Function list ------------------ -# This is the dictionary that is used by the RE|PARSE +# This is the dictionary that is used by the Reparse # expression builder. The key is the same value used in the patterns.yaml # file under ``Function: ``. The value is a reference to function. 
diff --git a/examples/phone/phone.py b/examples/phone/phone.py index 340e8c7..96509f7 100644 --- a/examples/phone/phone.py +++ b/examples/phone/phone.py @@ -1,12 +1,13 @@ +from __future__ import unicode_literals """ Example of a phone number parser ->>> phone_parser('+974-584-5656') +>>> phone_parser('+974-584-5656') # doctest: +IGNORE_UNICODE [phone(area_code='974', prefix='584', body='5656', fax=False)] ->>> phone_parser('Fax: +974-584-5656') +>>> phone_parser('Fax: +974-584-5656') # doctest: +IGNORE_UNICODE [phone(area_code='974', prefix='584', body='5656', fax=True)] """ # Example stuff ----------------------------------------------------- # Have to add the parent directory just in case you -# run this file in the demo directory without installing RE|PARSE +# run this file in the demo directory without installing Reparse import sys sys.path.append('../..') @@ -19,7 +20,7 @@ path += "/" -# RE|PARSE ---------------------------------------------------------- +# Reparse ---------------------------------------------------------- from examples.phone.functions import functions import reparse diff --git a/examples/readme.rst b/examples/readme.rst index 8fc3cf8..c559b56 100644 --- a/examples/readme.rst +++ b/examples/readme.rst @@ -1,4 +1,4 @@ -These examples shows a very basic RE|PARSE setup to help you get started. +These examples show a very basic Reparse setup to help you get started. 
Under each directory there are files like this:: expressions.yaml -- Contains the regular expression building blocks diff --git a/nose.cfg b/nose.cfg new file mode 100644 index 0000000..6e7f10e --- /dev/null +++ b/nose.cfg @@ -0,0 +1,3 @@ +[nosetests] +with-doctest=1 +with-doctest-ignore-unicode=1 diff --git a/readme.rst b/readme.rst index d732c55..4fb51e4 100644 --- a/readme.rst +++ b/readme.rst @@ -1,5 +1,5 @@ -RE|PARSE -======== +Reparse +======= *Python library/tools for combining and parsing using Regular Expressions in a maintainable way* @@ -28,7 +28,7 @@ So you want to get (color and time) or ``[('green', datetime.time(23, 0))]`` out blah blah blah go to the store to buy green at 11pm! blah blah If you need scan/search/parse/transform some unstructured input and get some semi-structured data -out of it RE|PARSE might be able to help. +out of it Reparse might be able to help. First structure some Regular Expressions (Here, in Yaml) -------------------------------------------------------- @@ -105,9 +105,9 @@ Result Cool! -Intrigued? Learn more how to make the magic happen in `Howto: How to use RE|PARSE`_. +Intrigued? Learn more how to make the magic happen in `Howto: How to use Reparse`_. -Want to read more about what RE|PARSE is and what it can do? More info in `About: Why another tool for parsing?`_ +Want to read more about what Reparse is and what it can do? More info in `About: Why another tool for parsing?`_ Info ==== @@ -127,7 +127,7 @@ manually ~~~~~~~~ 1. If you don't have them already, - RE|PARSE depends on REGEX_, and PyYaml_. + Reparse depends on REGEX_, and PyYaml_. Download those and ``python setup.py install`` in their directories. If you are on windows, you may have to find binary installers for these, since they contain modules that have to be compiled. @@ -146,7 +146,7 @@ manually Support ------- -Need some help? Send me an email at asperous2@gmail.com and I'll do my best to help you. +Need some help? 
Send me an email at theandychase@gmail.com and I'll do my best to help you. Contribution ------------ @@ -157,6 +157,7 @@ Send me suggestions, issues, and pull requests and I'll gladly review them! Versions -------- +- *3.0* InvalidPattern Exception, Allow monkey patching regex arguments. RE|PARSE -> Reparse. - *2.1* Change `yaml.load` to `yaml.safe_load` for security - *2.0* Major Refactor, Python 3, Better Parser builders - *1.1* Fix setup.py @@ -176,7 +177,7 @@ MIT Licensed! See LICENSE file for the full text. .. _Docs at Readthedocs: https://reparse.readthedocs.org/en/latest/ -.. _`Howto: How to use RE|PARSE`: https://reparse.readthedocs.org/en/latest/howto.html +.. _`Howto: How to use Reparse`: https://reparse.readthedocs.org/en/latest/howto.html .. _`About: Why another tool for parsing?`: https://reparse.readthedocs.org/en/latest/about.html diff --git a/reparse/__init__.py b/reparse/__init__.py index 892b046..34f07bd 100644 --- a/reparse/__init__.py +++ b/reparse/__init__.py @@ -1,4 +1,4 @@ -""" RE|PARSE +""" Reparse """ from reparse.parsers import * diff --git a/reparse/builders.py b/reparse/builders.py index fe5dda2..8b6fec6 100644 --- a/reparse/builders.py +++ b/reparse/builders.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals from reparse.config import pattern_max_recursion_depth from reparse.expression import Group, AlternatesGroup, Expression from reparse.util import separate_string @@ -55,7 +56,7 @@ def func(_): def func(_): if any(_): return _ - func.__name__ = name + func.__name__ = str(name) return func def add_function(self, name, function): @@ -76,13 +77,13 @@ class Expression_Builder(object): >>> function_builder.get_function = get_function >>> expression = {'greeting':{'greeting':{'Expression': '(hey)|(cool)', 'Groups' : ['greeting', 'cooly']}}} >>> eb = Expression_Builder(expression, function_builder) - >>> eb.get_type("greeting").findall("hey, cool!") + >>> eb.get_type("greeting").findall("hey, cool!") # doctest: +IGNORE_UNICODE 
[[('hey',), ('',)], [('',), ('cool',)]] """ def __init__(self, expressions_dict, function_builder): self.type_db = {} - + for expression_type, expressions in expressions_dict.items(): type_expressions = [] for name, expression in expressions.items(): diff --git a/reparse/config.py b/reparse/config.py index 13950cd..3a33f8d 100644 --- a/reparse/config.py +++ b/reparse/config.py @@ -6,5 +6,9 @@ # The regex engine and settings regex_flags = regex.VERBOSE | regex.IGNORECASE -expression_compiler = lambda expression: regex.compile(expression, flags=regex_flags) -expression_sub = lambda expression, sub, string: regex.sub(expression, sub, string, flags=regex_flags) \ No newline at end of file + +def get_expression_compiler(): + return lambda expression: regex.compile(expression, flags=regex_flags) + +def get_expression_sub(): + return lambda expression, sub, string: regex.sub(expression, sub, string, flags=regex_flags) diff --git a/reparse/expression.py b/reparse/expression.py index 6e4ba93..2ef97e2 100644 --- a/reparse/expression.py +++ b/reparse/expression.py @@ -1,5 +1,6 @@ -from reparse.config import expression_compiler -from functools import reduce +from __future__ import unicode_literals +import regex +from reparse.config import get_expression_compiler class Expression(object): @@ -16,6 +17,15 @@ class Expression(object): results from the parsing functions. 
""" + class InvalidPattern(Exception): + def __init__(self, pattern, regex_error): + super(Expression.InvalidPattern, self).__init__() + self.pattern = pattern + self.regex_error = regex_error + + def __str__(self): + return '%{0.regex_error} in "{0.pattern}" pattern'.format(self) + def __init__(self, regex, functions, group_lengths, final_function, name=""): self.regex = regex self.group_functions = functions @@ -23,18 +33,23 @@ def __init__(self, regex, functions, group_lengths, final_function, name=""): self.final_function = final_function self.name = name self.compiled = False + self.expression_compiler = get_expression_compiler() - def ensure_compiled(self): + @property + def pattern(self): if not self.compiled: - self.compiled = expression_compiler(self.regex) + try: + self.compiled = self.expression_compiler(self.regex) + except regex.error as e: + raise self.InvalidPattern(self.regex, e) + return self.compiled def findall(self, string): """ Parse string, returning all outputs as parsed by functions """ - self.ensure_compiled() output = [] - for match in self.compiled.findall(string): - if isinstance(match, str): + for match in self.pattern.findall(string): + if hasattr(match, 'strip'): match = [match] self._list_add(output, self.run(match)) return output @@ -42,8 +57,7 @@ def findall(self, string): def scan(self, string): """ Like findall, but also returning matching start and end string locations """ - self.ensure_compiled() - return list(self._scanner_to_matches(self.compiled.scanner(string), self.run)) + return list(self._scanner_to_matches(self.pattern.scanner(string), self.run)) def run(self, matches): """ Run group functions over matches @@ -96,7 +110,7 @@ def AlternatesGroup(expressions, final_function, name=""): >>> from collections import namedtuple >>> expr = namedtuple('expr', 'regex group_lengths run')('(1)', [1], None) >>> grouping = AlternatesGroup([expr, expr], lambda f: None, 'yeah') - >>> grouping.regex + >>> grouping.regex # doctest: 
+IGNORE_UNICODE '(?:(1))|(?:(1))' >>> grouping.group_lengths [1, 1] diff --git a/reparse/parsers.py b/reparse/parsers.py index 40cee8a..d51cea6 100644 --- a/reparse/parsers.py +++ b/reparse/parsers.py @@ -62,7 +62,7 @@ def output(): def parser(parser_type=basic_parser, functions=None, patterns=None, expressions=None, patterns_yaml_path=None, expressions_yaml_path=None): - """ A RE|PARSE parser description. + """ A Reparse parser description. Simply provide the functions, patterns, & expressions to build. If you are using YAML for expressions + patterns, you can use ``expressions_yaml_path`` & ``patterns_yaml_path`` for convenience. @@ -77,9 +77,9 @@ def _load_yaml(file_path): with open(file_path) as f: return yaml.safe_load(f) - assert expressions or expressions_yaml_path, "RE|PARSE can't build a parser without expressions" - assert patterns or patterns_yaml_path, "RE|PARSE can't build a parser without patterns" - assert functions, "RE|PARSE can't build without a functions" + assert expressions or expressions_yaml_path, "Reparse can't build a parser without expressions" + assert patterns or patterns_yaml_path, "Reparse can't build a parser without patterns" + assert functions, "Reparse can't build without a functions" if patterns_yaml_path: patterns = _load_yaml(patterns_yaml_path) diff --git a/reparse/test/__init__.py b/reparse/test/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/reparse/test/expression_test.py b/reparse/test/expression_test.py deleted file mode 100644 index 7fd0836..0000000 --- a/reparse/test/expression_test.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest -from reparse.expression import Expression, AlternatesGroup - - -class expression_test(unittest.TestCase): - - def test_expressions(self): - def greeting(input): - def func(greeting): - return greeting - return func(*input) - - def final(input): - for i in input: - if i is not None: - return i - - regex = "(hi)|(hi)" - - exp = Expression(regex, [greeting, greeting], 
[1, 1], final) - - self.assertIsInstance(exp, Expression) - self.assertEquals(exp.findall("hi"), ["hi"]) - - def test_groups(self): - def greeting(input): - def func(greeting): - return greeting - return func(*input) - - def final(input): - for i in input: - if i is not None: - return i - regex = "(hi)" - exp = Expression(regex, [greeting], [1], final) - - regex = "(ho)" - exp2 = Expression(regex, [greeting], [1], final) - - grouped_expressions = AlternatesGroup([exp, exp2], final) - self.assertEquals(grouped_expressions.findall("hi"), ["hi"]) diff --git a/reparse/tools/expression_checker.py b/reparse/tools/expression_checker.py index 4ff3800..a60eea8 100644 --- a/reparse/tools/expression_checker.py +++ b/reparse/tools/expression_checker.py @@ -8,14 +8,15 @@ Example Usage:: - from reparse.expression_tester import expression_tester + from reparse.tools.expression_checker import check_expression import unittest - class cool_test(unittest.Unittest): + class cool_test(unittest.TestCase): def test_coolness(self): - expression_tester(self, load_yaml("parse/cool/expressions.yaml")) + check_expression(self, load_yaml("parse/cool/expressions.yaml")) """ -from reparse.config import expression_sub +from __future__ import unicode_literals +from reparse.config import get_expression_sub base_error_msg = "Expression Type [{}], Group [{}], " match_error_msg = base_error_msg + "Could not match [{}]" non_match_error_msg = base_error_msg + "Should not match [{}]" @@ -32,6 +33,8 @@ def check_expression(testing_framework, expression_dict): >>> check_expression(mock_framework(), ... 
{'class': {'group' :{'Matches': " 0 | 1", 'Non-Matches': "2 | 0 2", 'Expression': "[0-1]"}}}) """ + expression_sub = get_expression_sub() + for expression_type_name, expression_type in expression_dict.items(): for name, expression_object in expression_type.items(): if 'Matches' in expression_object.keys(): diff --git a/reparse/util.py b/reparse/util.py index 7f36c1b..cbf62c6 100644 --- a/reparse/util.py +++ b/reparse/util.py @@ -37,7 +37,7 @@ def remove_lower_overlapping(current, higher): >>> remove_lower_overlapping([('z', 5, 6)], [('a', 0, 5)]) [('z', 5, 6), ('a', 0, 5)] """ - for i, (match, h_start, h_end) in enumerate(higher): + for (match, h_start, h_end) in higher: overlaps = list(overlapping_at(h_start, h_end, current)) for overlap in overlaps: del current[overlap] diff --git a/reparse/validators.py b/reparse/validators.py index b5887f7..6a6292e 100644 --- a/reparse/validators.py +++ b/reparse/validators.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + # Validators pattern_key_error = "Pattern [{}] does not contain the 'Pattern' key" expression_key_error = "Expression Type [{}] Expression [{}] does not contain the 'Expression' key" diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..48d3614 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +-r requirements.txt +nose +doctest-ignore-unicode +pyyaml \ No newline at end of file diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 482abe6..622850b --- a/setup.py +++ b/setup.py @@ -8,30 +8,40 @@ def readme_or_docstring(): if os.path.isfile(path): return open(path).read() else: - import reparse + try: + import reparse + return reparse.__doc__ + except ImportError: + return 'Regular Expression based parsers for extracting data from natural language' - return reparse.__doc__ -setup(name='reparse', - version='2.1', - description='Sane Regular Expression based parsers', - long_description=readme_or_docstring(), - author='Andy Chase', - 
author_email='theandychase@gmail.com', - url='http://github.com/andychase/reparse', - download_url="https://github.com/andychase/reparse/archive/master.zip", - license="MIT", - packages=['reparse'], - install_requires=["regex"], - classifiers=( - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Natural Language :: English', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Topic :: Text Processing' - ), - ) +setup( + name='reparse', + version='3.0', + description='Regular Expression based parsers for extracting data from natural language', + long_description=readme_or_docstring(), + author='Andy Chase', + author_email='theandychase@gmail.com', + url='http://github.com/andychase/reparse', + download_url='https://github.com/andychase/reparse/archive/master.zip', + license='MIT', + packages=['reparse'], + install_requires=[ + 'regex', + 'pyyaml' + ], + classifiers=( + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Natural Language :: English', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Text Processing' + ), +) diff --git a/tests/tests_expression.py b/tests/tests_expression.py new file mode 100644 index 0000000..a04b458 --- /dev/null +++ b/tests/tests_expression.py @@ -0,0 +1,99 @@ +from __future__ import unicode_literals +import regex +from unittest import TestCase +from reparse.expression import AlternatesGroup, Expression +from reparse import config + + +class 
TestExpression(TestCase): + + def test_raises_useful_exception(self): + """Expression has to raise readable error message.""" + exp = Expression(r'inalid (\d]', {}, [], lambda x: x) + with self.assertRaises(exp.InvalidPattern): + assert not exp.pattern + + def test_expressions(self): + def greeting(input): + def func(greeting): + return greeting + return func(*input) + + def final(input): + for i in input: + if i is not None: + return i + + regex = "(hi)|(hi)" + + exp = Expression(regex, [greeting, greeting], [1, 1], final) + + self.assertIsInstance(exp, Expression) + self.assertEquals(exp.findall("hi"), ["hi"]) + + def test_groups(self): + def greeting(input): + def func(greeting): + return greeting + return func(*input) + + def final(input): + for i in input: + if i is not None: + return i + regex = "(hi)" + exp = Expression(regex, [greeting], [1], final) + + regex = "(ho)" + exp2 = Expression(regex, [greeting], [1], final) + + grouped_expressions = AlternatesGroup([exp, exp2], final) + self.assertEquals(grouped_expressions.findall("hi"), ["hi"]) + + +class TestCustomFlags(TestCase): + @classmethod + def setUpClass(cls): + cls._regex_flags = config.regex_flags + + @classmethod + def tearDownClass(cls): + config.regex_flags = cls._regex_flags + + def test_unicode_flag(self): + def uni_match(input): + def func(uni_match): + return uni_match + return func(*input) + + def final(input): + return input + + config.regex_flags = config.regex_flags | regex.UNICODE + + imp_regex = "(\w+)" + implicit_u = Expression(imp_regex, [uni_match], [1], final) + self.assertEquals(implicit_u.findall("b\xebs"), ["b\xebs"]) + + exp_regex = "([\u00c0-\ud7ff]+)" + explicit_u = Expression(exp_regex, [uni_match], [1], final) + self.assertEquals(explicit_u.findall("b\u00eb\u2013s"), ["\u00eb\u2013"]) + + def test_case_sensitivity(self): + def uni_match(input): + def func(uni_match): + return uni_match + return func(*input) + + def final(input): + return input + + config.regex_flags = 
regex.VERBOSE + + lower_regex = "([a-z]+)" + lower_exp = Expression(lower_regex, [uni_match], [1], final) + self.assertEquals(lower_exp.findall("aAbcBC"), ["a", "bc"]) + + upper_regex = "([A-Z]+)" + upper_exp = Expression(upper_regex, [uni_match], [1], final) + self.assertEquals(upper_exp.findall("aAbcBC"), ["A", "BC"]) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..a76d2df --- /dev/null +++ b/tox.ini @@ -0,0 +1,14 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +envlist = py27, py35 + +[testenv] +commands = nosetests --verbose --with-doctest --with-coverage --cover-package=reparse +deps = + nose + py{27,33,34,35}: coverage + py32: coverage==3.7.1