Skip to content

Commit

Permalink
Lint
Browse files Browse the repository at this point in the history
Signed-off-by: Olaf Lipinski <[email protected]>
  • Loading branch information
olipinski committed Nov 7, 2023
1 parent 18a58c9 commit f69b95d
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 16 deletions.
32 changes: 26 additions & 6 deletions emlangkit/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,12 @@ def boundaries(self, return_count: bool = False, return_mean: bool = False):

return self.__boundaries

def random_boundaries(self, return_count: bool = False, return_mean: bool = False, recompute: bool = False):
def random_boundaries(
self,
return_count: bool = False,
return_mean: bool = False,
recompute: bool = False,
):
if self.__random_boundaries is None and not recompute:
if self.__boundaries is None:
self.boundaries()
Expand Down Expand Up @@ -345,7 +350,12 @@ def segments(self, return_ids: bool = False, return_hashed_segments: bool = Fals

return self.__segments

def random_segments(self, return_ids: bool = False, return_hashed_segments: bool = False, recompute: bool = False):
def random_segments(
self,
return_ids: bool = False,
return_hashed_segments: bool = False,
recompute: bool = False,
):
if self.__random_segments is None and not recompute:
if self.__random_boundaries is None and not recompute:
self.random_boundaries()
Expand All @@ -362,7 +372,11 @@ def random_segments(self, return_ids: bool = False, return_hashed_segments: bool
return self.__random_segments, self.__random_hashed_segments

if return_ids and return_hashed_segments:
return self.__random_segments, self.__random_segment_ids, self.__random_hashed_segments
return (
self.__random_segments,
self.__random_segment_ids,
self.__random_hashed_segments,
)

return self.__random_segments

Expand All @@ -381,7 +395,9 @@ def has_stats(self, compute_topsim: bool = False) -> dict:
# and has no effect on the distance measurement
if compute_topsim:
padded_hashed_segments = utils.pad_jagged(self.__hashed_segments)
padded_random_hashed_segments = utils.pad_jagged(self.__random_hashed_segments)
padded_random_hashed_segments = utils.pad_jagged(
self.__random_hashed_segments
)

self.__has_stats = {
"vocab_size": len(self.__segment_ids),
Expand All @@ -390,15 +406,19 @@ def has_stats(self, compute_topsim: bool = False) -> dict:
# We use the Hamming distance here, as the segments could contain multiple characters,
# so editdistance would give us a worse estimate
"topographic_similarity": metrics.compute_topographic_similarity(
padded_hashed_segments, self.observations, message_dist_metric="hamming"
padded_hashed_segments,
self.observations,
message_dist_metric="hamming",
)
if compute_topsim
else None,
"random_vocab_size": len(self.__random_segment_ids),
"random_zla": random_zla,
"random_zipf": random_freq,
"random_topographic_similarity": metrics.compute_topographic_similarity(
padded_random_hashed_segments, self.observations, message_dist_metric="hamming"
padded_random_hashed_segments,
self.observations,
message_dist_metric="hamming",
)
if compute_topsim
else None,
Expand Down
7 changes: 5 additions & 2 deletions emlangkit/metrics/topsim.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Calculate topographic similarity for a given language."""
from typing import Tuple, Literal
from typing import Literal, Tuple

import editdistance
import numpy as np
Expand All @@ -8,7 +8,10 @@


def compute_topographic_similarity(
messages: np.ndarray, observations: np.ndarray, observations_dist_metric: str = "hamming", message_dist_metric: str = "editdistance"
messages: np.ndarray,
observations: np.ndarray,
observations_dist_metric: str = "hamming",
message_dist_metric: str = "editdistance",
) -> Tuple[float, float]:
"""
Calculate the topographic similarity between the given messages and observations.
Expand Down
4 changes: 1 addition & 3 deletions emlangkit/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""Root __init__ of the utils."""
from emlangkit.utils.array_ops import pad_jagged

__all__ = [
"pad_jagged"
]
__all__ = ["pad_jagged"]
7 changes: 4 additions & 3 deletions emlangkit/utils/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np


def pad_jagged(array: np.ndarray, fill: int = 0) -> np.ndarray:
    """
    Pad every row of a jagged array with ``fill`` so that it loses its jaggedness.

    Each row keeps its own values at the front; only the positions after its
    end, up to the length of the longest row, hold ``fill``.

    Parameters
    ----------
    array : np.ndarray
        Sequence of rows, possibly of different lengths, to pad.
    fill : int
        Value written into the padding positions. Defaults to 0.

    Returns
    -------
    np.ndarray
        Array of shape ``(len(array), length of the longest row)``. Its dtype
        is derived from ``fill`` by ``np.full``, and row values are cast to
        that dtype on assignment. An input with no rows yields an empty
        ``(0, 0)`` array.
    """
    # default=0 keeps an input with no rows from raising inside max().
    maxlen = max((len(row) for row in array), default=0)
    padded = np.full((len(array), maxlen), fill_value=fill)
    for idx, row in enumerate(array):
        # Assign rather than add: the buffer is pre-filled with `fill`, so
        # `+=` would shift every real value by `fill` whenever fill != 0.
        padded[idx, : len(row)] = row
    return padded
3 changes: 1 addition & 2 deletions tests/test_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,10 @@ def test_language_metrics():
lang.conditional_entropy()
lang.boundaries(return_count=True, return_mean=True)
lang.random_boundaries(return_count=True, return_mean=True)
lang.segments(return_ids=True,return_hashed_segments=True)
lang.segments(return_ids=True, return_hashed_segments=True)
lang.random_segments(return_ids=True, return_hashed_segments=True)
lang.has_stats(compute_topsim=True)

# Test recomputing random stats
lang.random_boundaries(recompute=True)
lang.random_segments(recompute=True)

0 comments on commit f69b95d

Please sign in to comment.