From 5a89c665d9b9d9c21e0d00c34c6a32bd886c6199 Mon Sep 17 00:00:00 2001
From: Colin Daglish <colin.daglish@ons.gov.uk>
Date: Tue, 8 Aug 2023 17:08:45 +0100
Subject: [PATCH] add unit tests for streamlit.py

---
 src/modules/streamlit.py        |  34 +++++-
 streamlit_app.py                |   2 +-
 tests/modules/test_streamlit.py | 202 ++++++++++++++++++++++++++++++++
 3 files changed, 231 insertions(+), 7 deletions(-)
 create mode 100644 tests/modules/test_streamlit.py

diff --git a/src/modules/streamlit.py b/src/modules/streamlit.py
index 4ffeb18..a72f5a0 100644
--- a/src/modules/streamlit.py
+++ b/src/modules/streamlit.py
@@ -286,7 +286,7 @@ def add_label_formatting(replacement_dict: dict, topic_sample: DataFrame) -> lis
     formatted_text = []
     for sample in topic_sample["responses"]:
         for key, value in replacement_dict.items():
-            sample = re.sub(rf"\s\b{key}\b", f" {value}", sample)
+            sample = re.sub(rf"\b{key}\b", f"{value}", sample)
         formatted_text.append([sample])
     return formatted_text
 
@@ -313,7 +313,11 @@ def get_single_topic_color(topic_names: list, topic_name: str) -> str:
 
 
 def single_topic_formatting(
-    top_n_words: Series, topic_sample: DataFrame, topic_name: str, topic_color: str
+    top_n_words: Series,
+    topic_sample: DataFrame,
+    topic_name: str,
+    topic_color: str,
+    stopwords: list,
 ) -> list:
     """Creates a streamlit annotate formatting setup for single topic
 
@@ -327,6 +331,8 @@ def single_topic_formatting(
         name of the topic
     topic_color: str
         hex code for the topic
+    stopwords:list
+        list of inconsequential words removed from corpus during cleaning
 
     Returns
     -------
@@ -334,11 +340,11 @@ def single_topic_formatting(
         a formatted list of strings and tuples
     """
     pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])"
-    pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s]"
+    pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s,]"
     pattern_combined = "|".join([pattern_behind, pattern_ahead])
-    top_n_words_x = top_n_words
-    replacements = [[i, topic_name, topic_color] for i in list(top_n_words)]
-    replacement_dict = dict(zip(top_n_words_x, replacements))
+    word_stopword_combos = create_word_stopword_combos(top_n_words, stopwords)
+    replacements = [[i, topic_name, topic_color] for i in list(word_stopword_combos)]
+    replacement_dict = dict(zip(word_stopword_combos, replacements))
     initial_formatted = add_label_formatting(replacement_dict, topic_sample)
     for idx in range(len(initial_formatted)):
         split_string = re.split(pattern_combined, initial_formatted[idx][0])
@@ -347,6 +353,22 @@ def single_topic_formatting(
     return initial_formatted
 
 
+# Series.reset_index()
+# test_data = topic_sample["responses"][1]
+
+# reindexed_top_words = top_n_words.reset_index(drop = True).reset_index()
+# reindexed_top_words["n_words"] = reindexed_top_words.word.apply(n_words)
+# sorted_top_words = reindexed_top_words.sort_values(
+#     ["n_words", "index"], ascending = False).word
+
+
+# for phrase in sorted_top_words:
+#     test_data = re.sub(phrase, snake_case(phrase), test_data)
+
+
+# def n_words(phrase):
+#     words = phrase.split()
+#     return len(words)
 def multitopic_formatting(
     dominant_topics: DataFrame, topic_sample: DataFrame, topic_names: list
 ) -> list:
diff --git a/streamlit_app.py b/streamlit_app.py
index fe8f9cf..486c78e 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -321,7 +321,7 @@
     word_stopword_combos = stream.create_word_stopword_combos(top_n_words, stopwords)
     topic_color = stream.get_single_topic_color(topic_names, topic_name)
     formatted_topic_single = stream.single_topic_formatting(
-        word_stopword_combos, topic_sample, topic_name, topic_color
+        top_n_words, topic_sample, topic_name, topic_color, stopwords
     )
     formatted_text = stream.multitopic_formatting(
         dominant_topics, topic_sample, topic_names
diff --git a/tests/modules/test_streamlit.py b/tests/modules/test_streamlit.py
new file mode 100644
index 0000000..43526c6
--- /dev/null
+++ b/tests/modules/test_streamlit.py
@@ -0,0 +1,202 @@
+import re
+from importlib import reload
+
+from pandas import DataFrame, Series
+
+# from src.modules import preprocessing as prep
+from src.modules import streamlit as stream
+
+reload(stream)
+
+
+class TestGetNTopWords:
+    def test_get_n_top_words(self):
+
+        test_df = DataFrame(
+            {
+                "topic_1_word_importance": [0, 1, 2],
+                "topic_2_word_importance": [0, 0, 0],
+                "word": ["alpha", "bravo", "charlie"],
+            }
+        )
+        actual = stream.get_top_n_words(topic_words=test_df, n=2, topic_name="Topic 1")
+        expected = Series(["bravo", "charlie"], index=[1, 2])
+        assert all(actual == expected)
+
+
+class TestIdentifyDominantTopics:
+    def test_identify_dominant_topics(self):
+        topic_names_snake = ["topic_1", "topic_2", "topic_3"]
+        test_df = DataFrame(
+            {
+                "word": ["alpha", "bravo", "charlie"],
+                "topic_1": [0, 1, 2],
+                "topic_2": [2, 3, 4],
+                "topic_3": [3, 2, 1],
+            }
+        )
+        actual = stream.identify_dominant_topics(
+            topic_words=test_df, topic_names_snake=topic_names_snake
+        )
+        expected = DataFrame(
+            {
+                "word": ["alpha", "bravo", "charlie"],
+                "variable": ["topic_3", "topic_2", "topic_2"],
+            }
+        )
+        assert all(actual == expected)
+
+
+class TestSnakeCase:
+    def test_snake_case(self):
+        actual = stream.snake_case("This string")
+        expected = "this_string"
+        assert actual == expected
+
+
+class TestGetNTopicSamples:
+    def test_get_n_topic_samples(self):
+        test_df = DataFrame(
+            {
+                "responses": ["hello word", "world hello", "hello hello"],
+                "topic_1": [0, 2, 1],
+            }
+        )
+        actual = stream.get_n_topic_samples(
+            text_with_topic_df=test_df, topic_name="Topic_1", n=2
+        )
+        expected = DataFrame(
+            {"responses": ["world hello", "hello hello"], "topic_1": [2, 1]}
+        )
+        assert all(actual == expected)
+
+
+class TestGetResponseNo:
+    def test_get_response_no(self):
+        test_df = DataFrame(
+            {
+                "responses": ["hello word", "world hello", "hello hello"],
+                "index": [455, 12, 11],
+            }
+        )
+        actual = stream.get_response_no(topic_sample=test_df, position=1)
+        expected = "Response 12"
+        assert actual == expected
+
+
+class TestGenerateTopScores:
+    def test_generate_top_scores(self):
+        test_df = DataFrame(
+            {
+                "responses": ["hello word", "world hello", "hello hello"],
+                "index": [53, 22, 12],
+                "topic_1": [0.1, 0.3, 0.01],
+                "topic_2": [0.12, 0.22, 0.32],
+            }
+        )
+        actual = stream.generate_top_scores(
+            topic_sample=test_df, topic_name="Topic 1", position=1
+        )
+        expected = "(Topic 1; Score: 30.0%)   (Topic 2; Score: 22.0%)"
+        assert actual == expected
+
+
+class TestGetHexColors:
+    def test_get_hex_colors_is_hex(self):
+        actual = stream.get_hex_colors(n_colors=1)
+        assert re.match(r"#[a-zA-Z0-9]{6}", actual[0]), "does not match hex pattern"
+
+    def test_get_hex_colors_n_returns(self):
+        actual = stream.get_hex_colors(n_colors=4)
+        assert len(actual) == 4
+        actual = stream.get_hex_colors(n_colors=2)
+        assert len(actual) == 2
+
+
+class TestGetFormattingTuple:
+    def test_get_formatting_tuple(self):
+        test_dominant_topics = DataFrame(
+            {"variable": ["topic_1", "topic_2"]}, index=["hello", "world"]
+        )
+        test_topic_color_dict = {"Topic 1": "#000000", "Topic 2": "#999999"}
+        actual = stream.create_formatting_tuple(
+            dominant_topics=test_dominant_topics,
+            word="hello",
+            topic_color_dict=test_topic_color_dict,
+        )
+
+        expected = ("hello", "Topic 1", "#000000")
+        assert actual == expected
+
+
+class TestCreateWordStopWordCombos:
+    def test_create_word_stopword_combo(self):
+        test_stopwords = ["he", "her"]
+        test_words = Series(["hello world", "hello"], index=[21, 42])
+        actual = stream.create_word_stopword_combos(
+            top_n_words=test_words, stopwords=test_stopwords
+        )
+        expected = ["hello he world", "hello her world", "hello world", "hello"]
+        assert actual == expected
+
+
+class TestInsertTuple:
+    def test_insert_tuple(self):
+        test_split_string = [
+            "hello my name",
+            "['word', 'Topic 1', '#000000']",
+            "is world",
+        ]
+        actual = stream.insert_tuple(split_string=test_split_string)
+        expected = ["hello my name", ("word", "Topic 1", "#000000"), "is world"]
+        assert actual == expected
+
+
+class TestAddLabelFormatting:
+    def test_add_label_formatting(self):
+        test_df = DataFrame(
+            {"responses": ["hello world how are you", "my name is world"]}
+        )
+        replacement_dict = {"world": "['world', 'Topic 1', '#000000']"}
+        actual = stream.add_label_formatting(
+            replacement_dict=replacement_dict, topic_sample=test_df
+        )
+        expected = [
+            ["hello ['world', 'Topic 1', '#000000'] how are you"],
+            ["my name is ['world', 'Topic 1', '#000000']"],
+        ]
+        assert actual == expected
+
+
+class TestGetSingleTopicColor:
+    def test_get_single_topic_color(self):
+        test_topic_names = ["Topic 1", "Topic 2"]
+        topic_1 = stream.get_single_topic_color(
+            topic_names=test_topic_names, topic_name="Topic 1"
+        )
+        topic_2 = stream.get_single_topic_color(
+            topic_names=test_topic_names, topic_name="Topic 2"
+        )
+        assert topic_1 != topic_2
+
+
+# class TestSingleTopicFormatting:
+#     def test_single_topic_formatting(self):
+#         top_n_words = Series(["hello", "world"])
+#         topic_sample = DataFrame({"responses": ["hello world how are you",
+#                                                 "hi world you are my oyster",
+#                                                 "hello my world how are you"],
+#                                    "topic_1": [0.9, 0.3, 0.2],
+#                                    "topic_2": [0.01, 0.8,0.6]})
+#         topic_name = "Topic 1"
+#         topic_color = "#000000"
+#         stopwords = prep.initialise_update_stopwords(["he"])
+#         actual = stream.single_topic_formatting(top_n_words= top_n_words,
+#                                        topic_sample = topic_sample,
+#                                        topic_name= "Topic 1",
+#                                        topic_color = "#000000",
+#                                        stopwords=stopwords)
+#         expected = [[('hello', 'Topic 1', '#000000'), \
+#                      ('world', 'Topic 1', '#000000'), 'how are you'],
+#                      ["hi ['world', 'Topic 1', '#000000']", 'you are my oyster']]
+#         assert actual == expected