From 5a89c665d9b9d9c21e0d00c34c6a32bd886c6199 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 8 Aug 2023 17:08:45 +0100 Subject: [PATCH 1/3] add unit tests for streamlit.py --- src/modules/streamlit.py | 34 +++++- streamlit_app.py | 2 +- tests/modules/test_streamlit.py | 202 ++++++++++++++++++++++++++++++++ 3 files changed, 231 insertions(+), 7 deletions(-) create mode 100644 tests/modules/test_streamlit.py diff --git a/src/modules/streamlit.py b/src/modules/streamlit.py index 4ffeb18..a72f5a0 100644 --- a/src/modules/streamlit.py +++ b/src/modules/streamlit.py @@ -286,7 +286,7 @@ def add_label_formatting(replacement_dict: dict, topic_sample: DataFrame) -> lis formatted_text = [] for sample in topic_sample["responses"]: for key, value in replacement_dict.items(): - sample = re.sub(rf"\s\b{key}\b", f" {value}", sample) + sample = re.sub(rf"\b{key}\b", f"{value}", sample) formatted_text.append([sample]) return formatted_text @@ -313,7 +313,11 @@ def get_single_topic_color(topic_names: list, topic_name: str) -> str: def single_topic_formatting( - top_n_words: Series, topic_sample: DataFrame, topic_name: str, topic_color: str + top_n_words: Series, + topic_sample: DataFrame, + topic_name: str, + topic_color: str, + stopwords: list, ) -> list: """Creates a streamlit annotate formatting setup for single topic @@ -327,6 +331,8 @@ def single_topic_formatting( name of the topic topic_color: str hex code for the topic + stopwords:list + list of inconsequential words removed from corpus during cleaning Returns ------- @@ -334,11 +340,11 @@ def single_topic_formatting( a formatted list of strings and tuples """ pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])" - pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s]" + pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s,]" pattern_combined = "|".join([pattern_behind, pattern_ahead]) - top_n_words_x = top_n_words - replacements = [[i, topic_name, topic_color] for i in list(top_n_words)] - replacement_dict = dict(zip(top_n_words_x, replacements)) + word_stopword_combos = create_word_stopword_combos(top_n_words, stopwords) + replacements = [[i, topic_name, topic_color] for i in list(word_stopword_combos)] + replacement_dict = dict(zip(word_stopword_combos, replacements)) initial_formatted = add_label_formatting(replacement_dict, topic_sample) for idx in range(len(initial_formatted)): split_string = re.split(pattern_combined, initial_formatted[idx][0]) @@ -347,6 +353,22 @@ def single_topic_formatting( return initial_formatted +# Series.reset_index() +# test_data = topic_sample["responses"][1] + +# reindexed_top_words = top_n_words.reset_index(drop = True).reset_index() +# reindexed_top_words["n_words"] = reindexed_top_words.word.apply(n_words) +# sorted_top_words = reindexed_top_words.sort_values( +# ["n_words", "index"], ascending = False).word + + +# for phrase in sorted_top_words: +# test_data = re.sub(phrase, snake_case(phrase), test_data) + + +# def n_words(phrase): +# words = phrase.split() +# return len(words) def multitopic_formatting( dominant_topics: DataFrame, topic_sample: DataFrame, topic_names: list ) -> list: diff --git a/streamlit_app.py b/streamlit_app.py index fe8f9cf..486c78e 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -321,7 +321,7 @@ word_stopword_combos = stream.create_word_stopword_combos(top_n_words, stopwords) topic_color = stream.get_single_topic_color(topic_names, topic_name) formatted_topic_single = stream.single_topic_formatting( - word_stopword_combos, topic_sample, topic_name, topic_color + top_n_words, topic_sample, topic_name, topic_color, stopwords ) formatted_text = stream.multitopic_formatting( dominant_topics, topic_sample, topic_names diff --git a/tests/modules/test_streamlit.py b/tests/modules/test_streamlit.py new file mode 100644 index 0000000..43526c6 --- /dev/null +++ b/tests/modules/test_streamlit.py @@ -0,0 +1,202 @@ +import re +from importlib import reload + +from pandas import DataFrame, Series + +# from src.modules import preprocessing as prep +from src.modules import streamlit as stream + +reload(stream) + + +class TestGetNTopWords: + def test_get_n_top_words(self): + + test_df = DataFrame( + { + "topic_1_word_importance": [0, 1, 2], + "topic_2_word_importance": [0, 0, 0], + "word": ["alpha", "bravo", "charlie"], + } + ) + actual = stream.get_top_n_words(topic_words=test_df, n=2, topic_name="Topic 1") + expected = Series(["bravo", "charlie"], index=[1, 2]) + assert all(actual == expected) + + +class TestIdentifyDominantTopics: + def test_identify_dominant_topics(self): + topic_names_snake = ["topic_1", "topic_2", "topic_3"] + test_df = DataFrame( + { + "word": ["alpha", "bravo", "charlie"], + "topic_1": [0, 1, 2], + "topic_2": [2, 3, 4], + "topic_3": [3, 2, 1], + } + ) + actual = stream.identify_dominant_topics( + topic_words=test_df, topic_names_snake=topic_names_snake + ) + expected = DataFrame( + { + "word": ["alpha", "bravo", "charlie"], + "variable": ["topic_3", "topic_2", "topic_2"], + } + ) + assert all(actual == expected) + + +class TestSnakeCase: + def test_snake_case(self): + actual = stream.snake_case("This string") + expected = "this_string" + assert actual == expected + + +class TestGetNTopicSamples: + def test_get_n_topic_samples(self): + test_df = DataFrame( + { + "responses": ["hello word", "world hello", "hello hello"], + "topic_1": [0, 2, 1], + } + ) + actual = stream.get_n_topic_samples( + text_with_topic_df=test_df, topic_name="Topic_1", n=2 + ) + expected = DataFrame( + {"responses": ["world hello", "hello hello"], "topic_1": [2, 1]} + ) + assert all(actual == expected) + + +class TestGetResponseNo: + def test_get_response_no(self): + test_df = DataFrame( + { + "responses": ["hello word", "world hello", "hello hello"], + "index": [455, 12, 11], + } + ) + actual = stream.get_response_no(topic_sample=test_df, position=1) + expected = "Response 12" + assert actual == expected + + +class TestGenerateTopScores: + def test_generate_top_scores(self): + test_df = DataFrame( + { + "responses": ["hello word", "world hello", "hello hello"], + "index": [53, 22, 12], + "topic_1": [0.1, 0.3, 0.01], + "topic_2": [0.12, 0.22, 0.32], + } + ) + actual = stream.generate_top_scores( + topic_sample=test_df, topic_name="Topic 1", position=1 + ) + expected = "(Topic 1; Score: 30.0%) (Topic 2; Score: 22.0%)" + assert actual == expected + + +class TestGetHexColors: + def test_get_hex_colors_is_hex(self): + actual = stream.get_hex_colors(n_colors=1) + assert re.match(r"#[a-zA-Z0-9]{6}", actual[0]), "does not match hex pattern" + + def test_get_hex_colors_n_returns(self): + actual = stream.get_hex_colors(n_colors=4) + assert len(actual) == 4 + actual = stream.get_hex_colors(n_colors=2) + assert len(actual) == 2 + + +class TestGetFormattingTuple: + def test_get_formatting_tuple(self): + test_dominant_topics = DataFrame( + {"variable": ["topic_1", "topic_2"]}, index=["hello", "world"] + ) + test_topic_color_dict = {"Topic 1": "#000000", "Topic 2": "#999999"} + actual = stream.create_formatting_tuple( + dominant_topics=test_dominant_topics, + word="hello", + topic_color_dict=test_topic_color_dict, + ) + + expected = ("hello", "Topic 1", "#000000") + assert actual == expected + + +class TestCreateWordStopWordCombos: + def test_create_word_stopword_combo(self): + test_stopwords = ["he", "her"] + test_words = Series(["hello world", "hello"], index=[21, 42]) + actual = stream.create_word_stopword_combos( + top_n_words=test_words, stopwords=test_stopwords + ) + expected = ["hello he world", "hello her world", "hello world", "hello"] + assert actual == expected + + +class TestInsertTuple: + def test_insert_tuple(self): + test_split_string = [ + "hello my name", + "['word', 'Topic 1', '#000000']", + "is world", + ] + actual = stream.insert_tuple(split_string=test_split_string) + expected = ["hello my name", ("word", "Topic 1", "#000000"), "is world"] + assert actual == expected + + +class TestAddLabelFormatting: + def test_add_label_formatting(self): + test_df = DataFrame( + {"responses": ["hello world how are you", "my name is world"]} + ) + replacement_dict = {"world": "['world', 'Topic 1', '#000000']"} + actual = stream.add_label_formatting( + replacement_dict=replacement_dict, topic_sample=test_df + ) + expected = [ + ["hello ['world', 'Topic 1', '#000000'] how are you"], + ["my name is ['world', 'Topic 1', '#000000']"], + ] + assert actual == expected + + +class TestGetSingleTopicColor: + def test_get_single_topic_color(self): + test_topic_names = ["Topic 1", "Topic 2"] + topic_1 = stream.get_single_topic_color( + topic_names=test_topic_names, topic_name="Topic 1" + ) + topic_2 = stream.get_single_topic_color( + topic_names=test_topic_names, topic_name="Topic 2" + ) + assert topic_1 != topic_2 + + +# class TestSingleTopicFormatting: +# def test_single_topic_formatting(self): +# top_n_words = Series(["hello", "world"]) +# topic_sample = DataFrame({"responses": ["hello world how are you", +# "hi world you are my oyster", +# "hello my world how are you"], +# "topic_1": [0.9, 0.3, 0.2], +# "topic_2": [0.01, 0.8,0.6]}) +# topic_name = "Topic 1" +# topic_color = "#000000" +# stopwords = prep.initialise_update_stopwords(["he"]) +# actual = stream.single_topic_formatting(top_n_words= top_n_words, +# topic_sample = topic_sample, +# topic_name= "Topic 1", +# topic_color = "#000000", +# stopwords=stopwords) +# expected = [[('hello', 'Topic 1', '#000000'), \ +# ('world', 'Topic 1', '#000000'), 'how are you'], +# ["hi ['world', 'Topic 1', '#000000']", 'you are my oyster']] +# assert actual == expected From 9d7131601dec38c0be8f74fd19e2f2e54d7901bc Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Wed, 9 Aug 2023 11:53:47 +0100 Subject: [PATCH 2/3] fix single topic labels and add unit tests --- src/modules/streamlit.py | 276 ++++++++++++++++++++------------ tests/modules/test_streamlit.py | 219 ++++++++++++++++++------- 2 files changed, 333 insertions(+), 162 deletions(-) diff --git a/src/modules/streamlit.py b/src/modules/streamlit.py index a72f5a0..4483592 100644 --- a/src/modules/streamlit.py +++ b/src/modules/streamlit.py @@ -178,6 +178,68 @@ def generate_top_scores(topic_sample: DataFrame, topic_name: str, position: int) return formatted_text_header +def single_topic_formatting( + top_n_words: Series, + topic_sample: DataFrame, + topic_name: str, + topic_names: list, + stopwords: list, +) -> list: + """Creates a streamlit annotate formatting setup for single topic + + Parameters + ---------- + top_n_words:Series + top n number of words with index numbers + topic_sample: DataFrame + sample of responses ordered by a particular topic + topic_name: str + name of the topic + topic_names: list + list of topic names + stopwords:list + list of inconsequential words removed from corpus during cleaning + + Returns + ------- + list + a formatted list of strings and tuples + """ + color = get_single_topic_color(topic_names, topic_name) + reindexed_top_words = reindex_top_words(top_n_words) + word_stopword_combos = create_word_stopword_combos(reindexed_top_words, stopwords) + replacement_dict = create_formatting_dictionary( + word_stopword_combos, topic_name, color + ) + responses = topic_sample["responses"].apply( + lambda x: insert_formatting_list(x, replacement_dict, word_stopword_combos) + ) + split_responses = responses.apply(split_string_on_list) + formatted_responses = split_responses.apply(insert_tuple) + return list(formatted_responses) + + +def get_single_topic_color(topic_names: list, topic_name: str) -> str: + """get the topic color for a single topic + + Parameters + ---------- + topic_names:list + list of topic names + topic_name:str + the topic name to select a color for + + Returns + ------- + str + hex code for the topic color""" + n_topics = len(topic_names) + topic_colors = get_hex_colors(n_topics).as_hex() + topic_number = [n for n, i in enumerate(topic_names) if i == topic_name] + topic_color = topic_colors[topic_number[0]] + return topic_color + + def get_hex_colors(n_colors: int) -> str: """Get the hex color codes for n_colors number of colors @@ -193,29 +255,40 @@ def get_hex_colors(n_colors: int) -> str: return sns.color_palette(n_colors=n_colors).as_hex() -def create_formatting_tuple( - dominant_topics: DataFrame, word: str, topic_color_dict: dict -) -> tuple: - """create a formatting tuple for streamlit annotation +def reindex_top_words(top_n_words: Series) -> Series: + """re-index top n words by the number of words in the phrase and then the + order of importance Parameters ---------- - dominant_topics:DataFrame - dataframe of words and their strongest associated topic - word:str - word to create tuple for - topic_color_dict:dict - dictionary of topics and their assigned colors + top_n_words:Series + the top n number of words within a given topic Returns ------- - tuple - formatting tuple containing word, topic, and color - """ - topic_x = dominant_topics.loc[word, "variable"] - topic_pretty = re.sub("_", " ", topic_x).capitalize() - topic_color = topic_color_dict[topic_pretty] - return (word, topic_pretty, topic_color) + Series + A reordered version of the same series""" + reindexed_top_words = top_n_words.reset_index(drop=True).reset_index() + reindexed_top_words["n_words"] = reindexed_top_words.word.apply(count_words) + sorted_top_words = reindexed_top_words.sort_values( + ["n_words", "index"], ascending=[False, True] + ).word + return sorted_top_words + + +def count_words(phrase: str) -> int: + """Count the number of words in a phrase + + Parameters + ---------- + phrase:str + + Returns + ------- + int + the number of words in the phrase""" + words = phrase.split() + return len(words) def create_word_stopword_combos(top_n_words: Series, stopwords: list) -> list: @@ -245,130 +318,98 @@ def create_word_stopword_combos(top_n_words: Series, stopwords: list) -> list: return unnested_stopword_combo -def insert_tuple(split_string: list) -> list: - """replace string with streamlit annotate formatting tuple +def create_formatting_dictionary( + word_stopword_combos: list, topic_name: str, topic_color: str +) -> dict: + """Create a lookup dictionary to replace words with formatting instructions Parameters ---------- - split_string:list - list of strings which have been split at tuples + word_stopword_combos:list + list of top_n_words with joining stopword combinations + topic_name:str + the name of the topic + topic_color:str + the hex color code for the topic Returns ------- - list - list of strings and formatting tuples - """ - for n, i in enumerate(split_string): - matcher = re.match(r"\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\]", i) - if matcher: - replacement_tuple = tuple( - re.sub(r"\[|\]|'", "", matcher.group(0)).split(", ") - ) - split_string[n] = replacement_tuple - return split_string + dict + a lookup dictionary for formatting replacements""" + keys = word_stopword_combos + values = [f"['{key}', '{topic_name}', '{topic_color}']" for key in keys] + snake_keys = [snake_case(key) for key in keys] + return dict(zip(snake_keys, values)) -def add_label_formatting(replacement_dict: dict, topic_sample: DataFrame) -> list: - """add streamlit annotate label formatting within string +def insert_formatting_list( + string: str, replacement_dict: dict, word_stopword_combos: list +) -> str: + """insert formatting lookup list at match points for dictionary keys Parameters ---------- + string:str + the string to replace values within replacement_dict:dict - dictionary of values to replace with their tuple replacements - topic_sample: DataFrame - sample of responses ordered by a particular topic + lookup dictionary of replacments + word_stopword_combos:list + list of top_n_words with joining stopword combinations Returns ------- - list - list of strings and formatting tuples + str + string with values replaced with values wrapped in formatting """ - formatted_text = [] - for sample in topic_sample["responses"]: - for key, value in replacement_dict.items(): - sample = re.sub(rf"\b{key}\b", f"{value}", sample) - formatted_text.append([sample]) - return formatted_text + for word in word_stopword_combos: + string = re.sub(rf"\b{word}\b", snake_case(word), string) + for key, value in replacement_dict.items(): + string = re.sub(rf"(? str: - """get the topic color for a single topic +def split_string_on_list(string: str) -> list: + """split string before and after formatting points Parameters ---------- - topic_names:list - list of topic names - topic_name:str - the topic name to select a color for + string:str + the string to split Returns ------- - str - hex code for the topic color""" - n_topics = len(topic_names) - topic_colors = get_hex_colors(n_topics).as_hex() - topic_number = [n for n, i in enumerate(topic_names) if i == topic_name] - topic_color = topic_colors[topic_number[0]] - return topic_color + list + a list of strings split at formatting points""" + pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])" + pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s,]" + pattern_combined = "|".join([pattern_behind, pattern_ahead]) + split_string = re.split(pattern_combined, string) + return split_string -def single_topic_formatting( - top_n_words: Series, - topic_sample: DataFrame, - topic_name: str, - topic_color: str, - stopwords: list, -) -> list: - """Creates a streamlit annotate formatting setup for single topic +def insert_tuple(split_string: list) -> list: + """replace string with streamlit annotate formatting tuple Parameters ---------- - top_n_words:Series - top n number of words with index numbers - topic_sample: DataFrame - sample of responses ordered by a particular topic - topic_name: str - name of the topic - topic_color: str - hex code for the topic - stopwords:list - list of inconsequential words removed from corpus during cleaning + split_string:list + list of strings which have been split at tuples Returns ------- list - a formatted list of strings and tuples + list of strings and formatting tuples """ - pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])" - pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s,]" - pattern_combined = "|".join([pattern_behind, pattern_ahead]) - word_stopword_combos = create_word_stopword_combos(top_n_words, stopwords) - replacements = [[i, topic_name, topic_color] for i in list(word_stopword_combos)] - replacement_dict = dict(zip(word_stopword_combos, replacements)) - initial_formatted = add_label_formatting(replacement_dict, topic_sample) - for idx in range(len(initial_formatted)): - split_string = re.split(pattern_combined, initial_formatted[idx][0]) - split_string = insert_tuple(split_string) - initial_formatted[idx] = split_string - return initial_formatted - - -# Series.reset_index() -# test_data = topic_sample["responses"][1] - -# reindexed_top_words = top_n_words.reset_index(drop = True).reset_index() -# reindexed_top_words["n_words"] = reindexed_top_words.word.apply(n_words) -# sorted_top_words = reindexed_top_words.sort_values( -# ["n_words", "index"], ascending = False).word - - -# for phrase in sorted_top_words: -# test_data = re.sub(phrase, snake_case(phrase), test_data) + for n, i in enumerate(split_string): + matcher = re.match(r"\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\]", i) + if matcher: + replacement_tuple = tuple( + re.sub(r"\[|\]|'", "", matcher.group(0)).split(", ") + ) + split_string[n] = replacement_tuple + return split_string -# def n_words(phrase): -# words = phrase.split() -# return len(words) def multitopic_formatting( dominant_topics: DataFrame, topic_sample: DataFrame, topic_names: list ) -> list: @@ -406,3 +447,28 @@ def multitopic_formatting( formatted_response.append(word + " ") formatted_text.append(formatted_response) return formatted_text + + +def create_formatting_tuple( + dominant_topics: DataFrame, word: str, topic_color_dict: dict +) -> tuple: + """create a formatting tuple for streamlit annotation + + Parameters + ---------- + dominant_topics:DataFrame + dataframe of words and their strongest associated topic + word:str + word to create tuple for + topic_color_dict:dict + dictionary of topics and their assigned colors + + Returns + ------- + tuple + formatting tuple containing word, topic, and color + """ + topic_x = dominant_topics.loc[word, "variable"] + topic_pretty = re.sub("_", " ", topic_x).capitalize() + topic_color = topic_color_dict[topic_pretty] + return (word, topic_pretty, topic_color) diff --git a/tests/modules/test_streamlit.py b/tests/modules/test_streamlit.py index 43526c6..5caa498 100644 --- a/tests/modules/test_streamlit.py +++ b/tests/modules/test_streamlit.py @@ -101,6 +101,52 @@ def test_generate_top_scores(self): assert actual == expected +class TestSingleTopicFormatting: + def test_single_topic_formatting(self): + test_top_words = DataFrame({"word": ["hello world", "happy"]}).word + test_topic_sample = DataFrame( + { + "responses": [ + "hello world how are you", + "world hello how am i", + "I am so happy hello my world", + ], + "index": [53, 22, 12], + "topic_1": [0.1, 0.3, 0.01], + "topic_2": [0.12, 0.22, 0.32], + } + ) + actual = stream.single_topic_formatting( + top_n_words=test_top_words, + topic_sample=test_topic_sample, + topic_name="Topic 1", + topic_names=["Topic 1", "Topic 2"], + stopwords=["my"], + ) + expected = [ + [("hello world", "Topic 1", "#1f77b4"), "how are you"], + ["world hello how am i"], + [ + "I am so", + ("happy", "Topic 1", "#1f77b4"), + ("hello my world", "Topic 1", "#1f77b4"), + ], + ] + assert actual == expected + + +class TestGetSingleTopicColor: + def test_get_single_topic_color(self): + test_topic_names = ["Topic 1", "Topic 2"] + topic_1 = stream.get_single_topic_color( + topic_names=test_topic_names, topic_name="Topic 1" + ) + topic_2 = stream.get_single_topic_color( + topic_names=test_topic_names, topic_name="Topic 2" + ) + assert topic_1 != topic_2 + + class TestGetHexColors: def test_get_hex_colors_is_hex(self): actual = stream.get_hex_colors(n_colors=1) @@ -113,20 +159,20 @@ def test_get_hex_colors_n_returns(self): assert len(actual) == 2 -class TestGetFormattingTuple: - def test_get_formatting_tuple(self): - test_dominant_topics = DataFrame( - {"variable": ["topic_1", "topic_2"]}, index=["hello", "world"] - ) - test_topic_color_dict = {"Topic 1": "#000000", "Topic 2": "#999999"} - actual = stream.create_formatting_tuple( - dominant_topics=test_dominant_topics, - word="hello", - topic_color_dict=test_topic_color_dict, +class TestReindexTopWords: + def test_reindex_top_words(self): + test_top_words = Series(["hoppy", "hello world", "happy"], name="word") + actual = stream.reindex_top_words(test_top_words) + expected = Series( + ["hello world", "hoppy", "happy"], index=[1, 0, 2], name="word" ) + assert all(actual == expected) - expected = ("hello", "Topic 1", "#000000") - assert actual == expected + +class TestCountWords: + def test_count_words(self): + assert stream.count_words("hello world") == 2 + assert stream.count_words("hello") == 1 class TestCreateWordStopWordCombos: @@ -140,63 +186,122 @@ def test_create_word_stopword_combo(self): assert actual == expected +class TestCreateFormattingDictionary: + def test_create_formatting_dictionary(self): + test_word_stopword_combos = ["hello my world", "hello world"] + actual = stream.create_formatting_dictionary( + word_stopword_combos=test_word_stopword_combos, + topic_name="Topic 1", + topic_color="#000000", + ) + expected = { + "hello_my_world": "['hello my world', 'Topic 1', '#000000']", + "hello_world": "['hello world', 'Topic 1', '#000000']", + } + assert actual == expected + + +class TestInsertFormattingList: + def test_insert_formatting_list(self): + test_string = "hello my world, how are you this glorious day" + test_replacement_dict = { + "hello_my_world": "['hello my world', 'Topic 1', '#000000']" + } + actual = stream.insert_formatting_list( + string=test_string, + replacement_dict=test_replacement_dict, + word_stopword_combos=["hello my world"], + ) + expected = ( + "['hello my world', 'Topic 1', '#000000']," + + " how are you this glorious day" + ) + assert actual == expected + + +class TestSplitStringOnList: + def test_split_string_on_list(self): + test_string = "hello ['world', 'Topic 1', '#000000'], how are you" + actual = stream.split_string_on_list(test_string) + expected = ["hello", "['world', 'Topic 1', '#000000']", " how are you"] + assert actual == expected + + class TestInsertTuple: def test_insert_tuple(self): - test_split_string = [ - "hello my name", - "['word', 'Topic 1', '#000000']", - "is world", - ] - actual = stream.insert_tuple(split_string=test_split_string) - expected = ["hello my name", ("word", "Topic 1", "#000000"), "is world"] + test_list = ["hello", "['world', 'Topic 1', '#000000']", " how are you"] + actual = stream.insert_tuple(test_list) + expected = ["hello", ("world", "Topic 1", "#000000"), " how are you"] assert actual == expected -class TestAddLabelFormatting: - def test_add_label_formatting(self): - test_df = DataFrame( - {"responses": ["hello world how are you", "my name is world"]} +class TestMultitopicFormatting: + def test_dominant_topics(self): + test_dominant_df = DataFrame( + {"word": ["hello", "world"], "variable": ["topic_1", "topic_2"]} ) - replacement_dict = {"world": "['world', 'Topic 1', '#000000']"} - actual = stream.add_label_formatting( - replacement_dict=replacement_dict, topic_sample=test_df + test_topic_sample = DataFrame( + { + "index": [23, 25, 29], + "responses": [ + "hello world how are you", + "hello my world how are you", + "poppy flowers on sunday in the world", + ], + "topic_1": [0.1, 0.4, 0.8], + "topic_2": [1.0, 0.6, 0.4], + } + ) + actual = stream.multitopic_formatting( + dominant_topics=test_dominant_df, + topic_sample=test_topic_sample, + topic_names=["Topic 1", "Topic 2"], ) expected = [ - ["hello ['world', 'Topic 1', '#000000'] how are you"], - ["my name is ['world', 'Topic 1', '#000000']"], + [ + ("hello", "Topic 1", "#1f77b4"), + " ", + ("world", "Topic 2", "#ff7f0e"), + " ", + "how ", + "are ", + "you ", + ], + [ + ("hello", "Topic 1", "#1f77b4"), + " ", + "my ", + ("world", "Topic 2", "#ff7f0e"), + " ", + "how ", + "are ", + "you ", + ], + [ + "poppy ", + "flowers ", + "on ", + "sunday ", + "in ", + "the ", + ("world", "Topic 2", "#ff7f0e"), + " ", + ], ] assert actual == expected -class TestGetSingleTopicColor: - def test_get_single_topic_color(self): - test_topic_names = ["Topic 1", "Topic 2"] - topic_1 = stream.get_single_topic_color( - topic_names=test_topic_names, topic_name="Topic 1" +class TestCreateFormattingTuple: + def test_create_formatting_tuple(self): + test_dominant_topics = DataFrame( + {"variable": ["topic_1", "topic_2"]}, index=["hello", "world"] ) - topic_2 = stream.get_single_topic_color( - topic_names=test_topic_names, topic_name="Topic 2" + test_topic_color_dict = {"Topic 1": "#000000", "Topic 2": "#999999"} + actual = stream.create_formatting_tuple( + dominant_topics=test_dominant_topics, + word="hello", + topic_color_dict=test_topic_color_dict, ) - assert topic_1 != topic_2 - -# class TestSingleTopicFormatting: -# def test_single_topic_formatting(self): -# top_n_words = Series(["hello", "world"]) -# topic_sample = DataFrame({"responses": ["hello world how are you", -# "hi world you are my oyster", -# "hello my world how are you"], -# "topic_1": [0.9, 0.3, 0.2], -# "topic_2": [0.01, 0.8,0.6]}) -# topic_name = "Topic 1" -# topic_color = "#000000" -# stopwords = prep.initialise_update_stopwords(["he"]) -# actual = stream.single_topic_formatting(top_n_words= top_n_words, -# topic_sample = topic_sample, -# topic_name= "Topic 1", -# topic_color = "#000000", -# stopwords=stopwords) -# expected = [[('hello', 'Topic 1', '#000000'), \ -# ('world', 'Topic 1', '#000000'), 'how are you'], -# ["hi ['world', 'Topic 1', '#000000']", 'you are my oyster']] -# assert actual == expected + expected = ("hello", "Topic 1", "#000000") + assert actual == expected From 08399395dc94169758030591a75d9f7ab1bed4aa Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Wed, 9 Aug 2023 12:03:27 +0100 Subject: [PATCH 3/3] update streamlit app --- streamlit_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streamlit_app.py b/streamlit_app.py index 486c78e..ae9f2d4 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -321,7 +321,7 @@ word_stopword_combos = stream.create_word_stopword_combos(top_n_words, stopwords) topic_color = stream.get_single_topic_color(topic_names, topic_name) formatted_topic_single = stream.single_topic_formatting( - top_n_words, topic_sample, topic_name, topic_color, stopwords + top_n_words, topic_sample, topic_name, topic_names, stopwords ) formatted_text = stream.multitopic_formatting( dominant_topics, topic_sample, topic_names