Skip to content

Commit

Permalink
Merge pull request #20 from datasciencecampus/feature/report
Browse files Browse the repository at this point in the history
add unit tests for streamlit.py
  • Loading branch information
brenng1 authored Sep 1, 2023
2 parents bc2b902 + 0839939 commit 98986e3
Show file tree
Hide file tree
Showing 3 changed files with 479 additions and 84 deletions.
254 changes: 171 additions & 83 deletions src/modules/streamlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,68 @@ def generate_top_scores(topic_sample: DataFrame, topic_name: str, position: int)
return formatted_text_header


def single_topic_formatting(
    top_n_words: Series,
    topic_sample: DataFrame,
    topic_name: str,
    topic_names: list,
    stopwords: list,
) -> list:
    """Build the streamlit annotate formatting for a single topic

    Each response in ``topic_sample["responses"]`` is passed through
    ``insert_formatting_list``, ``split_string_on_list`` and ``insert_tuple``
    in that order.

    Parameters
    ----------
    top_n_words:Series
        top n number of words with index numbers
    topic_sample: DataFrame
        sample of responses ordered by a particular topic; must contain a
        "responses" column
    topic_name: str
        name of the topic
    topic_names: list
        list of topic names
    stopwords:list
        list of inconsequential words removed from corpus during cleaning

    Returns
    -------
    list
        one formatted entry per response, in the order of the sample
    """
    # The colour lookup runs first, as it is the cheapest way to fail when
    # topic_name is not one of topic_names.
    topic_color = get_single_topic_color(topic_names, topic_name)
    combos = create_word_stopword_combos(reindex_top_words(top_n_words), stopwords)
    lookup = create_formatting_dictionary(combos, topic_name, topic_color)

    def annotate(response):
        # mark the matches in the text, split around the marks, then turn
        # each mark into a formatting tuple
        marked = insert_formatting_list(response, lookup, combos)
        return insert_tuple(split_string_on_list(marked))

    return list(topic_sample["responses"].apply(annotate))


def get_single_topic_color(topic_names: list, topic_name: str) -> str:
    """Look up the color assigned to one topic

    One color is generated per entry in ``topic_names``; the color returned
    is the one at the position of the first entry equal to ``topic_name``.

    Parameters
    ----------
    topic_names:list
        list of topic names
    topic_name:str
        the topic name to select a color for

    Returns
    -------
    str
        hex code for the topic color

    Raises
    ------
    IndexError
        if ``topic_name`` does not occur in ``topic_names``
    """
    # NOTE(review): get_hex_colors appears to return hex values already, so
    # the extra .as_hex() call looks redundant - confirm before removing.
    palette = get_hex_colors(len(topic_names)).as_hex()
    positions = [pos for pos, name in enumerate(topic_names) if name == topic_name]
    return palette[positions[0]]


def get_hex_colors(n_colors: int) -> str:
"""Get the hex color codes for n_colors number of colors
Expand All @@ -193,29 +255,40 @@ def get_hex_colors(n_colors: int) -> str:
return sns.color_palette(n_colors=n_colors).as_hex()


def create_formatting_tuple(
dominant_topics: DataFrame, word: str, topic_color_dict: dict
) -> tuple:
"""create a formatting tuple for streamlit annotation
def reindex_top_words(top_n_words: Series) -> Series:
"""re-index top n words by the number of words in the phrase and then the
order of importance
Parameters
----------
dominant_topics:DataFrame
dataframe of words and their strongest associated topic
word:str
word to create tuple for
topic_color_dict:dict
dictionary of topics and their assigned colors
top_n_words:Series
the top n number of words within a given topic
Returns
-------
tuple
formatting tuple containing word, topic, and color
"""
topic_x = dominant_topics.loc[word, "variable"]
topic_pretty = re.sub("_", " ", topic_x).capitalize()
topic_color = topic_color_dict[topic_pretty]
return (word, topic_pretty, topic_color)
Series
A reordered version of the same series"""
reindexed_top_words = top_n_words.reset_index(drop=True).reset_index()
reindexed_top_words["n_words"] = reindexed_top_words.word.apply(count_words)
sorted_top_words = reindexed_top_words.sort_values(
["n_words", "index"], ascending=[False, True]
).word
return sorted_top_words


def count_words(phrase: str) -> int:
    """Return how many whitespace-separated words a phrase contains

    Parameters
    ----------
    phrase:str
        the text to count words in

    Returns
    -------
    int
        the number of words in the phrase (0 for an empty or blank phrase)
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace
    return len(phrase.split())


def create_word_stopword_combos(top_n_words: Series, stopwords: list) -> list:
Expand Down Expand Up @@ -245,106 +318,96 @@ def create_word_stopword_combos(top_n_words: Series, stopwords: list) -> list:
return unnested_stopword_combo


def insert_tuple(split_string: list) -> list:
"""replace string with streamlit annotate formatting tuple
def create_formatting_dictionary(
word_stopword_combos: list, topic_name: str, topic_color: str
) -> dict:
"""Create a lookup dictionary to replace words with formatting instructions
Parameters
----------
split_string:list
list of strings which have been split at tuples
word_stopword_combos:list
list of top_n_words with joining stopword combinations
topic_name:str
the name of the topic
topic_color:str
the hex color code for the topic
Returns
-------
list
list of strings and formatting tuples
"""
for n, i in enumerate(split_string):
matcher = re.match(r"\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\]", i)
if matcher:
replacement_tuple = tuple(
re.sub(r"\[|\]|'", "", matcher.group(0)).split(", ")
)
split_string[n] = replacement_tuple
return split_string
dict
a lookup dictionary for formatting replacements"""
keys = word_stopword_combos
values = [f"['{key}', '{topic_name}', '{topic_color}']" for key in keys]
snake_keys = [snake_case(key) for key in keys]
return dict(zip(snake_keys, values))


def add_label_formatting(replacement_dict: dict, topic_sample: DataFrame) -> list:
"""add streamlit annotate label formatting within string
def insert_formatting_list(
string: str, replacement_dict: dict, word_stopword_combos: list
) -> str:
"""insert formatting lookup list at match points for dictionary keys
Parameters
----------
string:str
the string to replace values within
replacement_dict:dict
dictionary of values to replace with their tuple replacements
topic_sample: DataFrame
sample of responses ordered by a particular topic
lookup dictionary of replacements
word_stopword_combos:list
list of top_n_words with joining stopword combinations
Returns
-------
list
list of strings and formatting tuples
str
string with values replaced with values wrapped in formatting
"""
formatted_text = []
for sample in topic_sample["responses"]:
for key, value in replacement_dict.items():
sample = re.sub(rf"\s\b{key}\b", f" {value}", sample)
formatted_text.append([sample])
return formatted_text
for word in word_stopword_combos:
string = re.sub(rf"\b{word}\b", snake_case(word), string)
for key, value in replacement_dict.items():
string = re.sub(rf"(?<!')\b{key}\b(?!')", value, string)
return string


def get_single_topic_color(topic_names: list, topic_name: str) -> str:
"""get the topic color for a single topic
def split_string_on_list(string: str) -> list:
"""split string before and after formatting points
Parameters
----------
topic_names:list
list of topic names
topic_name:str
the topic name to select a color for
string:str
the string to split
Returns
-------
str
hex code for the topic color"""
n_topics = len(topic_names)
topic_colors = get_hex_colors(n_topics).as_hex()
topic_number = [n for n, i in enumerate(topic_names) if i == topic_name]
topic_color = topic_colors[topic_number[0]]
return topic_color
list
a list of strings split at formatting points"""
pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])"
pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s,]"
pattern_combined = "|".join([pattern_behind, pattern_ahead])
split_string = re.split(pattern_combined, string)
return split_string


def single_topic_formatting(
top_n_words: Series, topic_sample: DataFrame, topic_name: str, topic_color: str
) -> list:
"""Creates a streamlit annotate formatting setup for single topic
def insert_tuple(split_string: list) -> list:
"""replace string with streamlit annotate formatting tuple
Parameters
----------
top_n_words:Series
top n number of words with index numbers
topic_sample: DataFrame
sample of responses ordered by a particular topic
topic_name: str
name of the topic
topic_color: str
hex code for the topic
split_string:list
list of strings which have been split at tuples
Returns
-------
list
a formatted list of strings and tuples
list of strings and formatting tuples
"""
pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])"
pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s]"
pattern_combined = "|".join([pattern_behind, pattern_ahead])
top_n_words_x = top_n_words
replacements = [[i, topic_name, topic_color] for i in list(top_n_words)]
replacement_dict = dict(zip(top_n_words_x, replacements))
initial_formatted = add_label_formatting(replacement_dict, topic_sample)
for idx in range(len(initial_formatted)):
split_string = re.split(pattern_combined, initial_formatted[idx][0])
split_string = insert_tuple(split_string)
initial_formatted[idx] = split_string
return initial_formatted
for n, i in enumerate(split_string):
matcher = re.match(r"\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\]", i)
if matcher:
replacement_tuple = tuple(
re.sub(r"\[|\]|'", "", matcher.group(0)).split(", ")
)
split_string[n] = replacement_tuple
return split_string


def multitopic_formatting(
Expand Down Expand Up @@ -384,3 +447,28 @@ def multitopic_formatting(
formatted_response.append(word + " ")
formatted_text.append(formatted_response)
return formatted_text


def create_formatting_tuple(
    dominant_topics: DataFrame, word: str, topic_color_dict: dict
) -> tuple:
    """Assemble the (word, topic, color) tuple used for streamlit annotation

    The topic is read from the "variable" column of ``dominant_topics`` at
    row ``word``, underscores are swapped for spaces and the result is
    capitalized; that label is then used as the key into
    ``topic_color_dict``.

    Parameters
    ----------
    dominant_topics:DataFrame
        dataframe of words and their strongest associated topic
    word:str
        word to create tuple for
    topic_color_dict:dict
        dictionary of topics and their assigned colors, keyed by the
        prettified topic label

    Returns
    -------
    tuple
        formatting tuple containing word, topic, and color
    """
    raw_topic = dominant_topics.loc[word, "variable"]
    # e.g. "topic_1" -> "Topic 1", the form used for the color dictionary keys
    label = re.sub("_", " ", raw_topic).capitalize()
    return (word, label, topic_color_dict[label])
2 changes: 1 addition & 1 deletion streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@
word_stopword_combos = stream.create_word_stopword_combos(top_n_words, stopwords)
topic_color = stream.get_single_topic_color(topic_names, topic_name)
formatted_topic_single = stream.single_topic_formatting(
word_stopword_combos, topic_sample, topic_name, topic_color
top_n_words, topic_sample, topic_name, topic_names, stopwords
)
formatted_text = stream.multitopic_formatting(
dominant_topics, topic_sample, topic_names
Expand Down
Loading

0 comments on commit 98986e3

Please sign in to comment.