Skip to content

Commit

Permalink
add unit tests for streamlit.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ColinDaglish committed Aug 8, 2023
1 parent 9bd2cea commit 5a89c66
Show file tree
Hide file tree
Showing 3 changed files with 231 additions and 7 deletions.
34 changes: 28 additions & 6 deletions src/modules/streamlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def add_label_formatting(replacement_dict: dict, topic_sample: DataFrame) -> lis
formatted_text = []
for sample in topic_sample["responses"]:
for key, value in replacement_dict.items():
sample = re.sub(rf"\s\b{key}\b", f" {value}", sample)
sample = re.sub(rf"\b{key}\b", f"{value}", sample)
formatted_text.append([sample])
return formatted_text

Expand All @@ -313,7 +313,11 @@ def get_single_topic_color(topic_names: list, topic_name: str) -> str:


def single_topic_formatting(
top_n_words: Series, topic_sample: DataFrame, topic_name: str, topic_color: str
top_n_words: Series,
topic_sample: DataFrame,
topic_name: str,
topic_color: str,
stopwords: list,
) -> list:
"""Creates a streamlit annotate formatting setup for single topic
Expand All @@ -327,18 +331,20 @@ def single_topic_formatting(
name of the topic
topic_color: str
hex code for the topic
stopwords:list
list of inconsequential words removed from corpus during cleaning
Returns
-------
list
a formatted list of strings and tuples
"""
pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])"
pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s]"
pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s,]"

Check warning on line 343 in src/modules/streamlit.py

View check run for this annotation

Codecov / codecov/patch

src/modules/streamlit.py#L343

Added line #L343 was not covered by tests
pattern_combined = "|".join([pattern_behind, pattern_ahead])
top_n_words_x = top_n_words
replacements = [[i, topic_name, topic_color] for i in list(top_n_words)]
replacement_dict = dict(zip(top_n_words_x, replacements))
word_stopword_combos = create_word_stopword_combos(top_n_words, stopwords)
replacements = [[i, topic_name, topic_color] for i in list(word_stopword_combos)]
replacement_dict = dict(zip(word_stopword_combos, replacements))

Check warning on line 347 in src/modules/streamlit.py

View check run for this annotation

Codecov / codecov/patch

src/modules/streamlit.py#L345-L347

Added lines #L345 - L347 were not covered by tests
initial_formatted = add_label_formatting(replacement_dict, topic_sample)
for idx in range(len(initial_formatted)):
split_string = re.split(pattern_combined, initial_formatted[idx][0])
Expand All @@ -347,6 +353,22 @@ def single_topic_formatting(
return initial_formatted


# Series.reset_index()
# test_data = topic_sample["responses"][1]

# reindexed_top_words = top_n_words.reset_index(drop = True).reset_index()
# reindexed_top_words["n_words"] = reindexed_top_words.word.apply(n_words)
# sorted_top_words = reindexed_top_words.sort_values(
# ["n_words", "index"], ascending = False).word


# for phrase in sorted_top_words:
# test_data = re.sub(phrase, snake_case(phrase), test_data)


# def n_words(phrase):
# words = phrase.split()
# return len(words)
def multitopic_formatting(
dominant_topics: DataFrame, topic_sample: DataFrame, topic_names: list
) -> list:
Expand Down
2 changes: 1 addition & 1 deletion streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@
word_stopword_combos = stream.create_word_stopword_combos(top_n_words, stopwords)
topic_color = stream.get_single_topic_color(topic_names, topic_name)
formatted_topic_single = stream.single_topic_formatting(
word_stopword_combos, topic_sample, topic_name, topic_color
top_n_words, topic_sample, topic_name, topic_color, stopwords
)
formatted_text = stream.multitopic_formatting(
dominant_topics, topic_sample, topic_names
Expand Down
202 changes: 202 additions & 0 deletions tests/modules/test_streamlit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import re
from importlib import reload

from pandas import DataFrame, Series

# from src.modules import preprocessing as prep
from src.modules import streamlit as stream

# Re-execute the already-imported module under test so this file runs against
# its current source.
# NOTE(review): presumably a leftover from interactive development - confirm
# whether it is still needed when the suite is run by pytest.
reload(stream)


class TestGetNTopWords:
    """Tests for ``stream.get_top_n_words``."""

    def test_get_n_top_words(self):
        """Asking for 2 words of "Topic 1" yields "bravo" and "charlie".

        The expected Series keeps the row labels 1 and 2 of the input frame.
        """
        word_scores = DataFrame(
            {
                "topic_1_word_importance": [0, 1, 2],
                "topic_2_word_importance": [0, 0, 0],
                "word": ["alpha", "bravo", "charlie"],
            }
        )
        wanted = Series(["bravo", "charlie"], index=[1, 2])

        result = stream.get_top_n_words(
            topic_words=word_scores, n=2, topic_name="Topic 1"
        )

        # Element-wise comparison of two Series; all() walks the boolean values.
        assert all(result == wanted)


class TestIdentifyDominantTopics:
    """Tests for ``stream.identify_dominant_topics``."""

    def test_identify_dominant_topics(self):
        """Each word is paired with the topic column holding its largest score.

        In the fixture "alpha" peaks in topic_3 while "bravo" and "charlie"
        peak in topic_2, which is what the expected frame encodes.
        """
        topic_names_snake = ["topic_1", "topic_2", "topic_3"]
        test_df = DataFrame(
            {
                "word": ["alpha", "bravo", "charlie"],
                "topic_1": [0, 1, 2],
                "topic_2": [2, 3, 4],
                "topic_3": [3, 2, 1],
            }
        )
        actual = stream.identify_dominant_topics(
            topic_words=test_df, topic_names_snake=topic_names_snake
        )
        expected = DataFrame(
            {
                "word": ["alpha", "bravo", "charlie"],
                "variable": ["topic_3", "topic_2", "topic_2"],
            }
        )
        # Builtin all() over a DataFrame iterates its column labels (non-empty
        # strings, hence always truthy), so it would pass even when values
        # differ. Reduce over the cells instead: first per column, then overall.
        assert (actual == expected).all().all()


class TestSnakeCase:
    """Tests for ``stream.snake_case``."""

    def test_snake_case(self):
        """"This string" is expected to come back as "this_string"."""
        assert stream.snake_case("This string") == "this_string"


class TestGetNTopicSamples:
    """Tests for ``stream.get_n_topic_samples``."""

    def test_get_n_topic_samples(self):
        """With n=2 the two highest ``topic_1`` rows are expected, best first."""
        test_df = DataFrame(
            {
                "responses": ["hello word", "world hello", "hello hello"],
                "topic_1": [0, 2, 1],
            }
        )
        actual = stream.get_n_topic_samples(
            text_with_topic_df=test_df, topic_name="Topic_1", n=2
        )
        expected = DataFrame(
            {"responses": ["world hello", "hello hello"], "topic_1": [2, 1]}
        )
        # Builtin all() over a DataFrame only inspects column labels, which are
        # always truthy here; reduce the boolean frame over its cells so a
        # value mismatch actually fails the test.
        assert (actual == expected).all().all()


class TestGetResponseNo:
    """Tests for ``stream.get_response_no``."""

    def test_get_response_no(self):
        """Position 1 of the sample carries "index" 12, giving "Response 12"."""
        sample = DataFrame(
            {
                "responses": ["hello word", "world hello", "hello hello"],
                "index": [455, 12, 11],
            }
        )
        label = stream.get_response_no(topic_sample=sample, position=1)
        assert label == "Response 12"


class TestGenerateTopScores:
    """Tests for ``stream.generate_top_scores``."""

    def test_generate_top_scores(self):
        """The row at position 1 (0.3 / 0.22) is rendered as percentage scores."""
        sample = DataFrame(
            {
                "responses": ["hello word", "world hello", "hello hello"],
                "index": [53, 22, 12],
                "topic_1": [0.1, 0.3, 0.01],
                "topic_2": [0.12, 0.22, 0.32],
            }
        )
        wanted = "(Topic 1; Score: 30.0%) (Topic 2; Score: 22.0%)"

        rendered = stream.generate_top_scores(
            topic_sample=sample, topic_name="Topic 1", position=1
        )

        assert rendered == wanted


class TestGetHexColors:
    """Tests for ``stream.get_hex_colors``."""

    def test_get_hex_colors_is_hex(self):
        """The first returned colour is a ``#`` followed by six hex digits."""
        actual = stream.get_hex_colors(n_colors=1)
        # fullmatch rejects trailing characters, and the class is limited to
        # real hexadecimal digits; the previous re.match with [a-zA-Z0-9]
        # would have accepted values such as "#zzzzzz" or "#0000000".
        assert re.fullmatch(
            r"#[0-9a-fA-F]{6}", actual[0]
        ), "does not match hex pattern"

    def test_get_hex_colors_n_returns(self):
        """The number of colours returned equals the ``n_colors`` requested."""
        for n_colors in (4, 2):
            actual = stream.get_hex_colors(n_colors=n_colors)
            assert len(actual) == n_colors


class TestGetFormattingTuple:
    """Tests for ``stream.create_formatting_tuple``."""

    def test_get_formatting_tuple(self):
        """"hello" maps to topic_1, so the tuple carries "Topic 1" and its colour."""
        word_to_topic = DataFrame(
            {"variable": ["topic_1", "topic_2"]}, index=["hello", "world"]
        )
        colours_by_topic = {"Topic 1": "#000000", "Topic 2": "#999999"}

        result = stream.create_formatting_tuple(
            dominant_topics=word_to_topic,
            word="hello",
            topic_color_dict=colours_by_topic,
        )

        assert result == ("hello", "Topic 1", "#000000")


class TestCreateWordStopWordCombos:
    """Tests for ``stream.create_word_stopword_combos``."""

    def test_create_word_stopword_combo(self):
        """A two-word phrase gains one variant per stopword inserted between its words.

        The expected list starts with those variants, followed by the original
        phrases unchanged.
        """
        phrases = Series(["hello world", "hello"], index=[21, 42])
        fillers = ["he", "her"]
        wanted = ["hello he world", "hello her world", "hello world", "hello"]

        combos = stream.create_word_stopword_combos(
            top_n_words=phrases, stopwords=fillers
        )

        assert combos == wanted


class TestInsertTuple:
    """Tests for ``stream.insert_tuple``."""

    def test_insert_tuple(self):
        """A stringified label list is turned into a tuple; plain text is untouched."""
        pieces = [
            "hello my name",
            "['word', 'Topic 1', '#000000']",
            "is world",
        ]
        wanted = ["hello my name", ("word", "Topic 1", "#000000"), "is world"]

        assert stream.insert_tuple(split_string=pieces) == wanted


class TestAddLabelFormatting:
    """Tests for ``stream.add_label_formatting``."""

    def test_add_label_formatting(self):
        """Every whole-word "world" is swapped for its label text in each response."""
        label = "['world', 'Topic 1', '#000000']"
        responses = DataFrame(
            {"responses": ["hello world how are you", "my name is world"]}
        )
        wanted = [
            ["hello ['world', 'Topic 1', '#000000'] how are you"],
            ["my name is ['world', 'Topic 1', '#000000']"],
        ]

        labelled = stream.add_label_formatting(
            replacement_dict={"world": label}, topic_sample=responses
        )

        assert labelled == wanted


class TestGetSingleTopicColor:
    """Tests for ``stream.get_single_topic_color``."""

    def test_get_single_topic_color(self):
        """Two different topics from the same list get different colours."""
        names = ["Topic 1", "Topic 2"]
        first_colour, second_colour = (
            stream.get_single_topic_color(topic_names=names, topic_name=name)
            for name in names
        )
        assert first_colour != second_colour


# class TestSingleTopicFormatting:
# def test_single_topic_formatting(self):
# top_n_words = Series(["hello", "world"])
# topic_sample = DataFrame({"responses": ["hello world how are you",
# "hi world you are my oyster",
# "hello my world how are you"],
# "topic_1": [0.9, 0.3, 0.2],
# "topic_2": [0.01, 0.8,0.6]})
# topic_name = "Topic 1"
# topic_color = "#000000"
# stopwords = prep.initialise_update_stopwords(["he"])
# actual = stream.single_topic_formatting(top_n_words= top_n_words,
# topic_sample = topic_sample,
# topic_name= "Topic 1",
# topic_color = "#000000",
# stopwords=stopwords)
# expected = [[('hello', 'Topic 1', '#000000'), \
# ('world', 'Topic 1', '#000000'), 'how are you'],
# ["hi ['world', 'Topic 1', '#000000']", 'you are my oyster']]
# assert actual == expected

0 comments on commit 5a89c66

Please sign in to comment.