From 1f2b122290eefb89ce663576b377bbf5fa742d9a Mon Sep 17 00:00:00 2001
From: p-goulart
Date: Thu, 18 Jan 2024 13:14:20 +0100
Subject: [PATCH] Refactor temp file naming and module layout for easier
 debugging

- rename the temporary files so they are easier to find and grep for while
  debugging (in PyCharm or anywhere else);
- move some helpers between modules;
- add the 'validate_aff.py' script to sanity-check the .aff files (it was
  largely generated with ChatGPT and then fixed by hand, so review it with
  care).
---
 lib/dic_chunk.py                | 30 ++++++++++++-------
 lib/languagetool_utils.py       |  5 +++-
 lib/shell_command.py            |  7 +++--
 lib/utils.py                    | 19 +++++++-----
 lib/variant.py                  |  3 ++
 scripts/build_spelling_dicts.py | 10 +++++--
 scripts/build_tagger_dicts.py   |  8 ++++-
 scripts/validate_aff.py         | 53 +++++++++++++++++++++++++++++++++
 8 files changed, 108 insertions(+), 27 deletions(-)
 create mode 100644 scripts/validate_aff.py

diff --git a/lib/dic_chunk.py b/lib/dic_chunk.py
index b8f7616..330a3bc 100644
--- a/lib/dic_chunk.py
+++ b/lib/dic_chunk.py
@@ -6,6 +6,7 @@
 from lib.constants import LATIN_1_ENCODING
 from lib.logger import LOGGER
 from lib.shell_command import ShellCommand
+from lib.variant import Variant
 
 
 class DicChunk:
@@ -13,11 +14,13 @@ class DicChunk:
 
     Attributes:
         filepath (str): the path to the chunk
+        name (str): the name of the chunk (e.g. chunk0)
        compounds (bool): whether this is a file containing compounds or not; if True, this chunk will *not* be
            tokenised;
     """
-    def __init__(self, filepath: str, compounds: bool = False):
+    def __init__(self, filepath: str, name: str, compounds: bool = False):
         self.filepath = filepath
+        self.name = name
         self.compounds = compounds
 
     def __str__(self) -> str:
@@ -32,20 +35,27 @@ def rm(self) -> None:
         shutil.rmtree(self.filepath)
 
     @classmethod
-    def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, sample_size: int) -> List:
+    def from_hunspell_dic(cls, variant: Variant, chunk_size: int, target_dir: str, sample_size: int,
+                          compounds: bool = False) -> List:
        """Splits a dictionary file into smaller files (chunks) of a given number of lines.
        Args:
-            dic_path (str): the path to the Hunspell .dic file
+            variant (Variant): the variant for which we want to unmunch the .dic file
             chunk_size (int): the number of lines per chunk
             target_dir (str): the directory where the chunks will be saved
             sample_size (int): the number of lines to read from the dictionary file; if 0 or negative, read all lines
+            compounds (bool): whether this is a file containing compounds or not
         Returns:
             A list of DicChunk objects, each representing a chunk of the dictionary file
         """
+        if compounds:
+            tmp_dir = path.join(target_dir, 'compounds')
+            dic_path = variant.compounds()
+        else:
+            tmp_dir = target_dir
+            dic_path = variant.dic()
         LOGGER.debug(f"Splitting dictionary file \"{dic_path}\" into chunks...")
-        compounds = (True if 'compounds' in dic_path else False)
         with open(dic_path, 'r', encoding=LATIN_1_ENCODING) as dic_file:
             lines = dic_file.readlines()[1:]  # Skip the first line
             lines = [line for line in lines if not line.startswith("#")]  # Filter out comment lines
@@ -55,17 +65,14 @@ def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, samp
         str_chunks: List[List[str]] = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
         chunks: List[cls] = []
         for index, chunk in enumerate(str_chunks):
-            if compounds:
-                tmp_dir = path.join(target_dir, 'compounds')
-            else:
-                tmp_dir = target_dir
-            filename = path.basename(dic_path).replace('.dic', f'_chunk{index}.dic')
+            chunk_name = f"{variant.underscored}_chunk{index}"
+            filename = chunk_name + ".dic"
             chunk_path = path.join(tmp_dir, filename)
             with open(chunk_path, 'w', encoding=LATIN_1_ENCODING) as chunk_file:
                 # Prepend the count of lines in this chunk and then write all lines
                 chunk_file.write(f"{len(chunk)}\n")
                 chunk_file.writelines(chunk)
-            chunks.append(cls(chunk_path, compounds))
+            chunks.append(cls(chunk_path, chunk_name, compounds))
         LOGGER.debug(f"Split into {len(chunks)} chunks.")
         return chunks
@@ -79,7 +86,8 @@ def unmunch(self, aff_path: str, delete_tmp: bool = False) -> NamedTemporaryFile
         Returns:
             the temp file containing the unmunched dictionary
         """
-        unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb')
+        unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb',
+                                           prefix=f"{self.name}_unmunched_")
         LOGGER.debug(f"Unmunching {self} into {unmunched_tmp.name} ...")
         cmd_unmunch = f"unmunch {self.filepath} {aff_path}"
         unmunch_result = ShellCommand(cmd_unmunch).run()
diff --git a/lib/languagetool_utils.py b/lib/languagetool_utils.py
index b64b38e..6420d70 100644
--- a/lib/languagetool_utils.py
+++ b/lib/languagetool_utils.py
@@ -1,3 +1,4 @@
+import re
 from tempfile import NamedTemporaryFile
 from typing import List
 
@@ -33,7 +34,9 @@ def tokenise(self, unmunched_file: NamedTemporaryFile) -> NamedTemporaryFile:
            a NamedTemporaryFile with the result of tokenisation written to it; note this is a UTF-8-encoded file;
            it is not at this stage that we move from latin-1 encoding to UTF-8.
""" - tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w') + chunk_pattern = re.compile("[a-z]{2}_[A-Z]{2}(?:_[a-zA-Z0-9]+)?_chunk\\d+") + prefix = chunk_pattern.findall(unmunched_file.name.split('/')[-1])[0] + "_tokenised_" + tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w', prefix=prefix) LOGGER.debug(f"Tokenising {unmunched_file.name} into {tokenised_tmp.name} ...") tokenise_cmd = ( f"java -cp {LT_JAR_PATH}:" diff --git a/lib/shell_command.py b/lib/shell_command.py index e3243e9..278a79c 100644 --- a/lib/shell_command.py +++ b/lib/shell_command.py @@ -16,10 +16,11 @@ def __init__(self, return_code: int, stderr: AnyStr = None): class ShellCommand: """A class for executing Java commands.""" - def __init__(self, command_str: str, env: dict = None): + def __init__(self, command_str: str, env: dict = None, cwd: str = '.'): self.command_str = command_str self.split_cmd = shlex.split(self.command_str) self.env: dict = {**os.environ} + self.cwd = cwd if env is not None: self.env.update(env) @@ -33,13 +34,13 @@ def check_status(return_code: int, stderr: AnyStr) -> None: def _popen(self, text: bool = False) -> subprocess.Popen: try: return subprocess.Popen(self.split_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, text=text, env=self.env) + stderr=subprocess.PIPE, text=text, env=self.env, cwd=self.cwd) except FileNotFoundError: raise ShellCommandException(255, "Command or file not found.") def _run(self) -> subprocess.run: try: - return subprocess.run(self.split_cmd, capture_output=True, env=self.env) + return subprocess.run(self.split_cmd, capture_output=True, env=self.env, cwd=self.cwd) except FileNotFoundError: raise ShellCommandException(255, "Command or file not found.") diff --git a/lib/utils.py b/lib/utils.py index c3bd7db..ec91e00 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,11 +1,11 @@ import codecs import shutil from datetime import timedelta -from os import chdir, path +from os import path from tempfile import NamedTemporaryFile from typing import Optional -from lib.constants import REPO_DIR, LT_DIR, JAVA_RESULTS_DIR, LATIN_1_ENCODING +from lib.constants import LT_DIR, JAVA_RESULTS_DIR, LATIN_1_ENCODING from lib.shell_command import ShellCommand from lib.logger import LOGGER @@ -13,23 +13,26 @@ def compile_lt_dev(): """Build with maven in the languagetool-dev directory.""" LOGGER.info("Compiling LT dev...") - chdir(path.join(LT_DIR, "languagetool-dev")) - ShellCommand("mvn clean compile assembly:single").run() - chdir(REPO_DIR) # Go back to the repo directory + wd = path.join(LT_DIR, "languagetool-dev") + ShellCommand("mvn clean compile assembly:single", cwd=wd).run() + + +def compile_lt(): + """Build with maven in the languagetool-dev directory.""" + LOGGER.info("Compiling LT...") + ShellCommand("mvn clean install -DskipTests", cwd=LT_DIR).run() def install_dictionaries(custom_version: Optional[str]): """Install our dictionaries to the local ~/.m2.""" LOGGER.info("Installing dictionaries...") - chdir(JAVA_RESULTS_DIR) env: dict = {} if custom_version is not None: LOGGER.info(f"Installing custom version \"{custom_version}\"") env['PT_DICT_VERSION'] = custom_version else: LOGGER.info(f"Installing environment-defined version \"{env['PT_DICT_VERSION']}\"") - ShellCommand("mvn clean install", env=env).run() - chdir(REPO_DIR) # Go back to the repo directory + ShellCommand("mvn clean install", env=env, cwd=JAVA_RESULTS_DIR).run() def convert_to_utf8(tmp_file: NamedTemporaryFile, delete_tmp: bool = False) -> 
diff --git a/lib/variant.py b/lib/variant.py
index e941d09..dfb6c27 100644
--- a/lib/variant.py
+++ b/lib/variant.py
@@ -31,6 +31,9 @@ def __init__(self, locale_code: str):
     def __str__(self) -> str:
         return self.hyphenated
 
+    def __repr__(self) -> str:
+        return self.hyphenated
+
     def aff(self) -> str:
         return path.join(HUNSPELL_DIR, f"{self.underscored}.aff")
diff --git a/scripts/build_spelling_dicts.py b/scripts/build_spelling_dicts.py
index 40c8696..8661b16 100644
--- a/scripts/build_spelling_dicts.py
+++ b/scripts/build_spelling_dicts.py
@@ -9,7 +9,7 @@
 from lib.dic_chunk import DicChunk
 from lib.logger import LOGGER
 from lib.constants import SPELLING_DICT_DIR
-from lib.utils import compile_lt_dev, install_dictionaries, convert_to_utf8, pretty_time_delta
+from lib.utils import compile_lt_dev, install_dictionaries, convert_to_utf8, pretty_time_delta, compile_lt
 from lib.variant import Variant, VARIANT_MAPPING
 from lib.languagetool_utils import LanguageToolUtils as LtUtils
@@ -83,7 +83,11 @@ def main():
         f"CUSTOM_INSTALL_VERSION: {CUSTOM_INSTALL_VERSION}\n"
         f"DIC_VARIANTS: {DIC_VARIANTS}\n"
     )
+    # We might consider *always* compiling, since the spelling dicts depend on the tagger dicts having been *installed*
+    # and compiled with LT. We also need to re-build LT itself to make sure that OUR tagger dicts are the ones
+    # used by the WordTokenizer.
     if FORCE_COMPILE:
+        compile_lt()
         compile_lt_dev()
     tasks = []
     processed_files: dict[str: List[NamedTemporaryFile]] = {}
@@ -92,8 +96,8 @@ def main():
     # and then split them based on the dialectal and pre/post agreement alternation files
     for variant in DIC_VARIANTS:
         processed_files[variant] = []
-        dic_chunks: List[DicChunk] = DicChunk.from_hunspell_dic(variant.dic(), CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE)
-        dic_chunks.extend(DicChunk.from_hunspell_dic(variant.compounds(), CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE))
+        dic_chunks: List[DicChunk] = DicChunk.from_hunspell_dic(variant, CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE)
+        dic_chunks.extend(DicChunk.from_hunspell_dic(variant, CHUNK_SIZE, TMP_DIR, SAMPLE_SIZE, compounds=True))
         for chunk in dic_chunks:
             tasks.append((variant, chunk))
     LOGGER.info("Starting unmunching and tokenisation process...")
diff --git a/scripts/build_tagger_dicts.py b/scripts/build_tagger_dicts.py
index c6dc9d2..e58d4ba 100644
--- a/scripts/build_tagger_dicts.py
+++ b/scripts/build_tagger_dicts.py
@@ -3,6 +3,7 @@
 """
 import argparse
 import os
+from datetime import datetime
 
 from lib.languagetool_utils import LanguageToolUtils
 from lib.logger import LOGGER
@@ -10,7 +11,7 @@
                           SORTED_POS_DICT_FILEPATH, POS_DICT_DIFF_FILEPATH, OLD_POS_DICT_FILEPATH, REPO_DIR,
                           TAGGER_DICT_DIR, LT_RESULTS_DIR)
 from lib.shell_command import ShellCommand
-from lib.utils import compile_lt_dev, install_dictionaries
+from lib.utils import compile_lt_dev, install_dictionaries, pretty_time_delta
 from lib.variant import Variant
@@ -59,6 +60,8 @@ def run_shell_script() -> None:
 
 
 def main():
+    start_time = datetime.now()
+    LOGGER.debug(f"Started at {start_time.strftime('%r')}")
     if FORCE_COMPILE:
         compile_lt_dev()
     run_shell_script()
@@ -67,6 +70,9 @@
         lt.build_synth_binary()
     if FORCE_INSTALL:
         install_dictionaries(custom_version=CUSTOM_INSTALL_VERSION)
+    end_time = datetime.now()
+    LOGGER.debug(f"Finished at {end_time.strftime('%r')}. "
" + f"Total time elapsed: {pretty_time_delta(end_time - start_time)}.") if __name__ == "__main__": diff --git a/scripts/validate_aff.py b/scripts/validate_aff.py new file mode 100644 index 0000000..f5bf35f --- /dev/null +++ b/scripts/validate_aff.py @@ -0,0 +1,53 @@ +"""This was mostly made by chatgpt but of course i had to fix it because AI is stoopid.""" +import sys +import re + + +def validate_hunspell_aff(file_content): + lines = file_content.split('\n') + valid = True + errors = [] + + i = 0 + while i < len(lines): + line = lines[i].strip() + if line.startswith("SFX") or line.startswith("PFX"): + parts = line.split() + if len(parts) >= 4 and parts[2] == 'Y': + rule_count = int(parts[3]) + rule_type = parts[0] + rule_name = parts[1] + rule_lines = 0 + rule_start_line = i + i += 1 + same_block_pattern = re.compile(f"{rule_type}\\s+{rule_name}") + while i < len(lines) and same_block_pattern.search(lines[i]): + if not lines[i].strip().startswith("#"): + rule_lines += 1 + i += 1 + + if rule_lines != rule_count: + valid = False + errors.append(f"Rule {rule_type} {rule_name} at line {rule_start_line + 1}: " + f"Expected {rule_count} rules, found {rule_lines}") + continue + i += 1 + + return valid, errors + + +def validate_hunspell_aff_file(filepath): + try: + with open(filepath, 'r', encoding='latin-1') as file: + file_content = file.read() + except FileNotFoundError: + return False, ["File not found."] + except UnicodeDecodeError: + return False, ["File encoding issue. Ensure the file is in LATIN-1 encoding."] + except Exception as e: + return False, [str(e)] + return validate_hunspell_aff(file_content) + + +if __name__ == '__main__': + print(validate_hunspell_aff_file(sys.argv[1]))