Commit 9b47606
fixes pylint errors and improves code quality (#146)
* uses pycodestyle since pep8 is deprecated

* fixes some pylint errors

* improves code quality
Trybnetic authored Mar 1, 2018
1 parent 27bb8c6 commit 9b47606
Showing 13 changed files with 144 additions and 158 deletions.
4 changes: 2 additions & 2 deletions pyndl/__init__.py
@@ -61,10 +61,10 @@ def sysinfo():
               "CPU: {cpu_count}\n").format(s=uname, cpu_count=mp.cpu_count())

     if uname.sysname == "Linux":
-        names, *lines = os.popen("free -m").readlines()
+        _, *lines = os.popen("free -m").readlines()
         for identifier in ["Mem:", "Swap:"]:
             memory = [line for line in lines if identifier in line][0]
-            ix, total, used, *rest = memory.split()
+            _, total, used, *_ = memory.split()
             osinfo += "{} {}MiB/{}MiB\n".format(identifier, used, total)

     osinfo += "\n"
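As a quick illustration of what this parsing does, a hedged, self-contained sketch; the free -m sample output below is made up:

    # Toy re-run of the parsing above; the sample output is invented.
    sample = ("              total        used        free\n"
              "Mem:          15936        4242       11694\n"
              "Swap:          2047           0        2047\n")
    _, *lines = sample.splitlines(keepends=True)  # drop the header row
    for identifier in ["Mem:", "Swap:"]:
        memory = [line for line in lines if identifier in line][0]
        _, total, used, *_ = memory.split()  # keep only total and used
        print("{} {}MiB/{}MiB".format(identifier, used, total))
    # Mem: 4242MiB/15936MiB
    # Swap: 0MiB/2047MiB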
29 changes: 16 additions & 13 deletions pyndl/activation.py
@@ -16,6 +16,7 @@
 from . import ndl


+# pylint: disable=W0621
 def activation(events, weights, number_of_threads=1, remove_duplicates=None, ignore_missing_cues=False):
     """
     Estimate activations for given events in event file and outcome-cue weights.
@@ -60,18 +61,17 @@ def activation(events, weights, number_of_threads=1, remove_duplicates=None, ignore_missing_cues=False):
     if isinstance(events, str):
         events = ndl.events_from_file(events)

-    event_cues_list = (cues for cues, outcomes in events)
+    events = (cues for cues, outcomes in events)
     if remove_duplicates is None:
-        def enforce_no_duplicates(cues):
+        def check_no_duplicates(cues):
             if len(cues) != len(set(cues)):
-                raise ValueError('cues needs to be unique: "%s"; use '
-                                 'remove_duplicates=True' %
-                                 (' '.join(cues)))
+                raise ValueError('cues needs to be unique: "{}"; use '
+                                 'remove_duplicates=True'.format(' '.join(cues)))
             else:
                 return set(cues)
-        event_cues_list = (enforce_no_duplicates(cues) for cues in event_cues_list)
+        events = (check_no_duplicates(cues) for cues in events)
     elif remove_duplicates is True:
-        event_cues_list = (set(cues) for cues in event_cues_list)
+        events = (set(cues) for cues in events)

     if isinstance(weights, xr.DataArray):
         cues = weights.coords["cues"].values.tolist()
@@ -81,23 +81,25 @@ def enforce_no_duplicates(cues):
         cue_map = OrderedDict(((cue, ii) for ii, cue in enumerate(cues)))
         if ignore_missing_cues:
             event_cue_indices_list = (tuple(cue_map[cue] for cue in event_cues if cue in cues)
-                                      for event_cues in event_cues_list)
+                                      for event_cues in events)
         else:
             event_cue_indices_list = (tuple(cue_map[cue] for cue in event_cues)
-                                      for event_cues in event_cues_list)
-        activations = _activation_matrix(list(event_cue_indices_list), weights.values, number_of_threads)
+                                      for event_cues in events)
+        # pylint: disable=W0621
+        activations = _activation_matrix(list(event_cue_indices_list),
+                                         weights.values, number_of_threads)
         return xr.DataArray(activations,
                             coords={
                                 'outcomes': outcomes
                             },
                             dims=('outcomes', 'events'))
     elif isinstance(weights, dict):
         assert number_of_threads == 1, "Estimating activations with multiprocessing is not implemented for dicts."
-        activations = defaultdict(lambda: np.zeros(len(event_cues_list)))
-        event_cues_list = list(event_cues_list)
+        activations = defaultdict(lambda: np.zeros(len(events)))
+        events = list(events)
         for outcome, cue_dict in weights.items():
             _activations = activations[outcome]
-            for row, cues in enumerate(event_cues_list):
+            for row, cues in enumerate(events):
                 for cue in cues:
                     _activations[row] += cue_dict[cue]
         return activations
@@ -111,6 +113,7 @@ def _init_mp_activation_matrix(weights_, weights_shape_, activations_, activatio
     Initializes shared variables weights and activations.
     """
+    # pylint: disable=C0103, W0621, W0601
     global weights, activations
     weights = np.ctypeslib.as_array(weights_)
     weights.shape = weights_shape_
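For orientation, a hedged usage sketch of the dict branch of activation() above; the toy event and weight values are invented:

    # Toy call into the dict branch of activation(); values are invented.
    from collections import defaultdict
    from pyndl.activation import activation

    # outcome -> {cue: weight}; defaultdict(float) returns 0.0 for unseen cues
    weights = {'plural': defaultdict(float, {'s#': 0.3, 'ss': 0.1})}
    events = [(['s#', 'ss'], ['plural'])]
    acts = activation(events, weights, number_of_threads=1)
    print(dict(acts))  # {'plural': array([0.4])}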
2 changes: 2 additions & 0 deletions pyndl/corpus.py
@@ -100,6 +100,8 @@ def read_clean_gzfile(gz_file_path, *, break_duration=2.0):


 class JobParseGz():
+    # pylint: disable=E0202,missing-docstring
+
     """
     Stores the persistent information over several jobs and exposes a job
     method that only takes the varying parts as one argument.
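For background on the disable above: pylint's E0202 (method-hidden) fires when an instance attribute assignment shadows a method of the same name. A minimal invented reproduction, not taken from corpus.py:

    # Minimal invented reproduction of pylint E0202 (method-hidden).
    class Job():
        def __init__(self):
            self.run = None  # attribute hides the method below -> E0202

        def run(self):
            pass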
1 change: 0 additions & 1 deletion pyndl/count.py
@@ -14,7 +14,6 @@
 import gzip
 import itertools
 import multiprocessing
-import os
 import sys

15 changes: 9 additions & 6 deletions pyndl/ndl.py
@@ -203,7 +203,7 @@ def worker():
         for partlist in part_lists:
             working_queue.put(np.array(partlist, dtype=np.uint32))

-        for thread_id in range(number_of_threads):
+        for _ in range(number_of_threads):
             thread = threading.Thread(target=worker)
             thread.start()
             threads.append(thread)
@@ -248,7 +248,7 @@ def _attributes(event_path, number_events, alpha, betas, lambda_, cpu_time,
     def _format(value):
         return '{0: <{width}}'.format(value, width=width)

-    if not type(alpha) in (float, int):
+    if not isinstance(alpha, (float, int)):
         alpha = 'varying'

     new_attrs = {'date': _format(time.strftime("%Y-%m-%d %H:%M:%S")),
@@ -278,7 +278,7 @@ def _format(value):
         if key in new_attrs:
             new_val = new_attrs[key]
         else:
-            new_val = format_('')
+            new_val = ''
         new_attrs[key] = old_val + ' | ' + new_val
     return new_attrs


Expand All @@ -295,9 +295,12 @@ class WeightDict(defaultdict):
"""

# pylint: disable=W0613
def __init__(self, *args, **kwargs):
super().__init__(lambda: defaultdict(float))

self._attrs = OrderedDict()

if 'attrs' in kwargs:
self.attrs = kwargs['attrs']
else:
@@ -391,9 +394,9 @@ def dict_ndl(events, alphas, betas, lambda_=1.0, *,
         attrs_to_update = weights_ini.attrs
         coords = weights_ini.coords
         weights = WeightDict()
-        for oi, outcome in enumerate(coords['outcomes'].values):
-            for ci, cue in enumerate(coords['cues'].values):
-                weights[outcome][cue] = weights_ini.item((oi, ci))
+        for outcome_index, outcome in enumerate(coords['outcomes'].values):
+            for cue_index, cue in enumerate(coords['cues'].values):
+                weights[outcome][cue] = weights_ini.item((outcome_index, cue_index))
     elif not isinstance(weights, defaultdict):
         raise ValueError('weights needs to be either defaultdict or None')

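A quick illustration of why pylint prefers the isinstance form used above; the MyFloat class is invented for the demo:

    # Why isinstance beats an exact type check: subclasses still match.
    class MyFloat(float):
        pass

    alpha = MyFloat(0.5)
    print(type(alpha) in (float, int))      # False: exact type match only
    print(isinstance(alpha, (float, int)))  # True: subclasses count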
70 changes: 38 additions & 32 deletions pyndl/preprocess.py
@@ -73,6 +73,38 @@ def bandsample(population, sample_size=50000, *, cutoff=5, seed=None,
     return sample


+def ngrams_to_word(occurrences, n_chars, outfile, remove_duplicates=True):
+    """
+    Process the occurrences and write them to outfile.
+
+    Parameters
+    ----------
+    occurrences : sequence of (cues, outcomes) tuples
+        cues and outcomes are both strings where underscores and # are
+        special symbols.
+    n_chars : number of characters (e.g. 2 for bigrams, 3 for trigrams, ...)
+    outfile : file handle
+    remove_duplicates : bool
+        if True make cues and outcomes per event unique
+
+    """
+    for cues, outcomes in occurrences:
+        if cues and outcomes:
+            occurrence = cues + '_' + outcomes
+        else:  # take either
+            occurrence = cues + outcomes
+        phrase_string = "#" + re.sub("_", "#", occurrence) + "#"
+        ngrams = (phrase_string[i:(i + n_chars)] for i in
+                  range(len(phrase_string) - n_chars + 1))
+        if not ngrams or not occurrence:
+            continue
+        if remove_duplicates:
+            outfile.write("{}\t{}\n".format("_".join(set(ngrams)), occurrence))
+        else:
+            outfile.write("{}\t{}\n".format("_".join(ngrams), occurrence))
+
+
 def process_occurrences(occurrences, outfile, *,
                         cue_structure="trigrams_to_word", remove_duplicates=True):
     """
@@ -92,43 +124,17 @@
     """
     if cue_structure == "bigrams_to_word":
-        for cues, outcomes in occurrences:
-            if cues and outcomes:
-                occurrence = cues + '_' + outcomes
-            else:  # take either
-                occurrence = cues + outcomes
-            phrase_string = "#" + re.sub("_", "#", occurrence) + "#"
-            bigrams = (phrase_string[i:(i + 2)] for i in
-                       range(len(phrase_string) - 2 + 1))
-            if not bigrams or not occurrence:
-                continue
-            if remove_duplicates:
-                outfile.write("_".join(set(bigrams)) + "\t" + occurrence + "\n")
-            else:
-                outfile.write("_".join(bigrams) + "\t" + occurrence + "\n")
+        ngrams_to_word(occurrences, 2, outfile, remove_duplicates=remove_duplicates)
     elif cue_structure == "trigrams_to_word":
-        for cues, outcomes in occurrences:
-            if cues and outcomes:
-                occurrence = cues + '_' + outcomes
-            else:  # take either
-                occurrence = cues + outcomes
-            phrase_string = "#" + re.sub("_", "#", occurrence) + "#"
-            trigrams = (phrase_string[i:(i + 3)] for i in
-                        range(len(phrase_string) - 3 + 1))
-            if not trigrams or not occurrence:
-                continue
-            if remove_duplicates:
-                outfile.write("_".join(set(trigrams)) + "\t" + occurrence + "\n")
-            else:
-                outfile.write("_".join(trigrams) + "\t" + occurrence + "\n")
+        ngrams_to_word(occurrences, 3, outfile, remove_duplicates=remove_duplicates)
     elif cue_structure == "word_to_word":
         for cues, outcomes in occurrences:
             if not cues:
                 continue
             if remove_duplicates:
-                outfile.write("_".join(set(cues.split("_"))) + "\t" + outcomes + "\n")
+                outfile.write("{}\t{}\n".format("_".join(set(cues.split("_"))), outcomes))
             else:
-                outfile.write(cues + "\t" + outcomes + "\n")
+                outfile.write("{}\t{}\n".format(cues, outcomes))
     else:
         raise NotImplementedError('cue_structure=%s is not implemented yet.' % cue_structure)
Expand Down Expand Up @@ -761,9 +767,9 @@ def create_binary_event_files(event_file,

def _error_callback(error):
if isinstance(error, StopIteration):
msg, result = error.value
_, result = error.value
nonlocal number_events
number_events += result
number_events += result # pylint: disable=undefined-variable
pool.close()
else:
raise error
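To make the shared helper concrete, a hedged toy run; io.StringIO stands in for the file handle, and the import assumes ngrams_to_word is importable from pyndl.preprocess after this change:

    # Toy run of the new ngrams_to_word helper; StringIO replaces a real file.
    import io
    from pyndl.preprocess import ngrams_to_word

    outfile = io.StringIO()
    # one event with cues "hans" and no outcomes -> occurrence is "hans"
    ngrams_to_word([("hans", "")], 3, outfile, remove_duplicates=False)
    print(repr(outfile.getvalue()))  # '#ha_han_ans_ns#\thans\n'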
4 changes: 3 additions & 1 deletion tests/conftest.py
@@ -1,10 +1,12 @@

 '''
 Configuration for py.test-3.
 '''


 def pytest_addoption(parser):
+    """
+    adds custom option to the pytest parser
+    """
     parser.addoption("--runslow", action="store_true",
                      help="run slow tests")
22 changes: 11 additions & 11 deletions tests/test_activation.py
@@ -15,7 +15,7 @@
 from pyndl import ndl
 from pyndl.activation import activation

-slow = pytest.mark.skipif(not pytest.config.getoption("--runslow"),
+slow = pytest.mark.skipif(not pytest.config.getoption("--runslow"),  # pylint: disable=invalid-name
                           reason="need --runslow option to run")

 TEST_ROOT = os.path.join(os.path.pardir, os.path.dirname(__file__))
@@ -29,8 +29,8 @@

 def test_exceptions():
     with pytest.raises(ValueError) as e_info:
-        wm = ndl.dict_ndl(FILE_PATH_SIMPLE, ALPHA, BETAS, remove_duplicates=None)
-        activation(FILE_PATH_MULTIPLE_CUES, wm)
+        weights = ndl.dict_ndl(FILE_PATH_SIMPLE, ALPHA, BETAS, remove_duplicates=None)
+        activation(FILE_PATH_MULTIPLE_CUES, weights)
         assert e_info == 'cues or outcomes needs to be unique: cues "a a"; outcomes "A"; use remove_duplicates=True'

     with pytest.raises(ValueError) as e_info:
with pytest.raises(ValueError) as e_info:
Expand Down Expand Up @@ -149,22 +149,22 @@ def test_activation_matrix_large():
print("")
print("Start setup...")

def time_test(func, of=""):
def time_test(func, of=""): # pylint: disable=invalid-name
def dec_func(*args, **kwargs):
print("start test '{}'".format(of))
st = time.clock()
start = time.clock()
res = func(*args, **kwargs)
et = time.clock()
end = time.clock()
print("finished test '{}'".format(of))
print(" duration: {:.3f}s".format(et-st))
print(" duration: {:.3f}s".format(end - start))
print("")
return res
return dec_func

n = 2000
n_cues = 10*n
n_outcomes = n
n_events = 10*n
nn = 2000
n_cues = 10*nn
n_outcomes = nn
n_events = 10*nn
n_cues_per_event = 30
weight_mat = np.random.rand(n_cues, n_outcomes)
cues = ['c'+str(i) for i in range(n_cues)]
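A hedged usage sketch of the time_test wrapper above (assuming it is in scope; slow_add is invented). Worth noting for a future pass: time.clock() is deprecated since Python 3.3 and removed in 3.8, so time.perf_counter() would be the forward-compatible choice here.

    # Hypothetical use of the time_test wrapper; slow_add is invented.
    import time

    def slow_add(a, b):
        time.sleep(0.1)
        return a + b

    timed_add = time_test(slow_add, of="slow_add")
    print(timed_add(1, 2))  # prints start/finish lines, the duration, then 3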
2 changes: 1 addition & 1 deletion tests/test_count.py
@@ -33,7 +33,7 @@ def test_words_symbols():

 def test_save_load():
     file_name = os.path.join(TEST_ROOT, "temp/cues.tab")
-    n_events, cues, outcomes = count.cues_outcomes(EVENT_RESOURCE_FILE)
+    _, cues, _ = count.cues_outcomes(EVENT_RESOURCE_FILE)
     count.save_counter(cues, file_name)
     cues_loaded = count.load_counter(file_name)
     assert cues == cues_loaded
(Diffs for the remaining 4 changed files are not shown.)
