diff --git a/src/modules/streamlit.py b/src/modules/streamlit.py index 4ffeb18..4483592 100644 --- a/src/modules/streamlit.py +++ b/src/modules/streamlit.py @@ -178,6 +178,68 @@ def generate_top_scores(topic_sample: DataFrame, topic_name: str, position: int) return formatted_text_header +def single_topic_formatting( + top_n_words: Series, + topic_sample: DataFrame, + topic_name: str, + topic_names: list, + stopwords: list, +) -> list: + """Creates a streamlit annotate formatting setup for single topic + + Parameters + ---------- + top_n_words:Series + top n number of words with index numbers + topic_sample: DataFrame + sample of responses ordered by a particular topic + topic_name: str + name of the topic + topic_names: list + list of topic names + stopwords:list + list of inconsequential words removed from corpus during cleaning + + Returns + ------- + list + a formatted list of strings and tuples + """ + color = get_single_topic_color(topic_names, topic_name) + reindexed_top_words = reindex_top_words(top_n_words) + word_stopword_combos = create_word_stopword_combos(reindexed_top_words, stopwords) + replacement_dict = create_formatting_dictionary( + word_stopword_combos, topic_name, color + ) + responses = topic_sample["responses"].apply( + lambda x: insert_formatting_list(x, replacement_dict, word_stopword_combos) + ) + split_responses = responses.apply(split_string_on_list) + formatted_responses = split_responses.apply(insert_tuple) + return list(formatted_responses) + + +def get_single_topic_color(topic_names: list, topic_name: str) -> str: + """get the topic color for a single topic + + Parameters + ---------- + topic_names:list + list of topic names + topic_name:str + the topic name to select a color for + + Returns + ------- + str + hex code for the topic color""" + n_topics = len(topic_names) + topic_colors = get_hex_colors(n_topics).as_hex() + topic_number = [n for n, i in enumerate(topic_names) if i == topic_name] + topic_color = topic_colors[topic_number[0]] + return topic_color + + def get_hex_colors(n_colors: int) -> str: """Get the hex color codes for n_colors number of colors @@ -193,29 +255,40 @@ def get_hex_colors(n_colors: int) -> str: return sns.color_palette(n_colors=n_colors).as_hex() -def create_formatting_tuple( - dominant_topics: DataFrame, word: str, topic_color_dict: dict -) -> tuple: - """create a formatting tuple for streamlit annotation +def reindex_top_words(top_n_words: Series) -> Series: + """re-index top n words by the number of words in the phrase and then the + order of importance Parameters ---------- - dominant_topics:DataFrame - dataframe of words and their strongest associated topic - word:str - word to create tuple for - topic_color_dict:dict - dictionary of topics and their assigned colors + top_n_words:Series + the top n number of words within a given topic Returns ------- - tuple - formatting tuple containing word, topic, and color - """ - topic_x = dominant_topics.loc[word, "variable"] - topic_pretty = re.sub("_", " ", topic_x).capitalize() - topic_color = topic_color_dict[topic_pretty] - return (word, topic_pretty, topic_color) + Series + A reordered version of the same series""" + reindexed_top_words = top_n_words.reset_index(drop=True).reset_index() + reindexed_top_words["n_words"] = reindexed_top_words.word.apply(count_words) + sorted_top_words = reindexed_top_words.sort_values( + ["n_words", "index"], ascending=[False, True] + ).word + return sorted_top_words + + +def count_words(phrase: str) -> int: + """Count the number of words in a phrase + + Parameters + ---------- + phrase:str + + Returns + ------- + int + the number of words in the phrase""" + words = phrase.split() + return len(words) def create_word_stopword_combos(top_n_words: Series, stopwords: list) -> list: @@ -245,106 +318,96 @@ def create_word_stopword_combos(top_n_words: Series, stopwords: list) -> list: return unnested_stopword_combo -def insert_tuple(split_string: list) -> list: - """replace string with streamlit annotate formatting tuple +def create_formatting_dictionary( + word_stopword_combos: list, topic_name: str, topic_color: str +) -> dict: + """Create a lookup dictionary to replace words with formatting instructions Parameters ---------- - split_string:list - list of strings which have been split at tuples + word_stopword_combos:list + list of top_n_words with joining stopword combinations + topic_name:str + the name of the topic + topic_color:str + the hex color code for the topic Returns ------- - list - list of strings and formatting tuples - """ - for n, i in enumerate(split_string): - matcher = re.match(r"\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\]", i) - if matcher: - replacement_tuple = tuple( - re.sub(r"\[|\]|'", "", matcher.group(0)).split(", ") - ) - split_string[n] = replacement_tuple - return split_string + dict + a lookup dictionary for formatting replacements""" + keys = word_stopword_combos + values = [f"['{key}', '{topic_name}', '{topic_color}']" for key in keys] + snake_keys = [snake_case(key) for key in keys] + return dict(zip(snake_keys, values)) -def add_label_formatting(replacement_dict: dict, topic_sample: DataFrame) -> list: - """add streamlit annotate label formatting within string +def insert_formatting_list( + string: str, replacement_dict: dict, word_stopword_combos: list +) -> str: + """insert formatting lookup list at match points for dictionary keys Parameters ---------- + string:str + the string to replace values within replacement_dict:dict - dictionary of values to replace with their tuple replacements - topic_sample: DataFrame - sample of responses ordered by a particular topic + lookup dictionary of replacments + word_stopword_combos:list + list of top_n_words with joining stopword combinations Returns ------- - list - list of strings and formatting tuples + str + string with values replaced with values wrapped in formatting """ - formatted_text = [] - for sample in topic_sample["responses"]: - for key, value in replacement_dict.items(): - sample = re.sub(rf"\s\b{key}\b", f" {value}", sample) - formatted_text.append([sample]) - return formatted_text + for word in word_stopword_combos: + string = re.sub(rf"\b{word}\b", snake_case(word), string) + for key, value in replacement_dict.items(): + string = re.sub(rf"(? str: - """get the topic color for a single topic +def split_string_on_list(string: str) -> list: + """split string before and after formatting points Parameters ---------- - topic_names:list - list of topic names - topic_name:str - the topic name to select a color for + string:str + the string to split Returns ------- - str - hex code for the topic color""" - n_topics = len(topic_names) - topic_colors = get_hex_colors(n_topics).as_hex() - topic_number = [n for n, i in enumerate(topic_names) if i == topic_name] - topic_color = topic_colors[topic_number[0]] - return topic_color + list + a list of strings split at formatting points""" + pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])" + pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s,]" + pattern_combined = "|".join([pattern_behind, pattern_ahead]) + split_string = re.split(pattern_combined, string) + return split_string -def single_topic_formatting( - top_n_words: Series, topic_sample: DataFrame, topic_name: str, topic_color: str -) -> list: - """Creates a streamlit annotate formatting setup for single topic +def insert_tuple(split_string: list) -> list: + """replace string with streamlit annotate formatting tuple Parameters ---------- - top_n_words:Series - top n number of words with index numbers - topic_sample: DataFrame - sample of responses ordered by a particular topic - topic_name: str - name of the topic - topic_color: str - hex code for the topic + split_string:list + list of strings which have been split at tuples Returns ------- list - a formatted list of strings and tuples + list of strings and formatting tuples """ - pattern_behind = r"[\s,](?=\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\])" - pattern_ahead = r"(?<='#[a-zA-Z0-9]{6}'])[\s]" - pattern_combined = "|".join([pattern_behind, pattern_ahead]) - top_n_words_x = top_n_words - replacements = [[i, topic_name, topic_color] for i in list(top_n_words)] - replacement_dict = dict(zip(top_n_words_x, replacements)) - initial_formatted = add_label_formatting(replacement_dict, topic_sample) - for idx in range(len(initial_formatted)): - split_string = re.split(pattern_combined, initial_formatted[idx][0]) - split_string = insert_tuple(split_string) - initial_formatted[idx] = split_string - return initial_formatted + for n, i in enumerate(split_string): + matcher = re.match(r"\['[\w\s]+',\s'\w+\s\d+',\s'#[a-zA-Z0-9]{6}'\]", i) + if matcher: + replacement_tuple = tuple( + re.sub(r"\[|\]|'", "", matcher.group(0)).split(", ") + ) + split_string[n] = replacement_tuple + return split_string def multitopic_formatting( @@ -384,3 +447,28 @@ def multitopic_formatting( formatted_response.append(word + " ") formatted_text.append(formatted_response) return formatted_text + + +def create_formatting_tuple( + dominant_topics: DataFrame, word: str, topic_color_dict: dict +) -> tuple: + """create a formatting tuple for streamlit annotation + + Parameters + ---------- + dominant_topics:DataFrame + dataframe of words and their strongest associated topic + word:str + word to create tuple for + topic_color_dict:dict + dictionary of topics and their assigned colors + + Returns + ------- + tuple + formatting tuple containing word, topic, and color + """ + topic_x = dominant_topics.loc[word, "variable"] + topic_pretty = re.sub("_", " ", topic_x).capitalize() + topic_color = topic_color_dict[topic_pretty] + return (word, topic_pretty, topic_color) diff --git a/streamlit_app.py b/streamlit_app.py index fe8f9cf..ae9f2d4 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -321,7 +321,7 @@ word_stopword_combos = stream.create_word_stopword_combos(top_n_words, stopwords) topic_color = stream.get_single_topic_color(topic_names, topic_name) formatted_topic_single = stream.single_topic_formatting( - word_stopword_combos, topic_sample, topic_name, topic_color + top_n_words, topic_sample, topic_name, topic_names, stopwords ) formatted_text = stream.multitopic_formatting( dominant_topics, topic_sample, topic_names diff --git a/tests/modules/test_streamlit.py b/tests/modules/test_streamlit.py new file mode 100644 index 0000000..5caa498 --- /dev/null +++ b/tests/modules/test_streamlit.py @@ -0,0 +1,307 @@ +import re +from importlib import reload + +from pandas import DataFrame, Series + +# from src.modules import preprocessing as prep +from src.modules import streamlit as stream + +reload(stream) + + +class TestGetNTopWords: + def test_get_n_top_words(self): + + test_df = DataFrame( + { + "topic_1_word_importance": [0, 1, 2], + "topic_2_word_importance": [0, 0, 0], + "word": ["alpha", "bravo", "charlie"], + } + ) + actual = stream.get_top_n_words(topic_words=test_df, n=2, topic_name="Topic 1") + expected = Series(["bravo", "charlie"], index=[1, 2]) + assert all(actual == expected) + + +class TestIdentifyDominantTopics: + def test_identify_dominant_topics(self): + topic_names_snake = ["topic_1", "topic_2", "topic_3"] + test_df = DataFrame( + { + "word": ["alpha", "bravo", "charlie"], + "topic_1": [0, 1, 2], + "topic_2": [2, 3, 4], + "topic_3": [3, 2, 1], + } + ) + actual = stream.identify_dominant_topics( + topic_words=test_df, topic_names_snake=topic_names_snake + ) + expected = DataFrame( + { + "word": ["alpha", "bravo", "charlie"], + "variable": ["topic_3", "topic_2", "topic_2"], + } + ) + assert all(actual == expected) + + +class TestSnakeCase: + def test_snake_case(self): + actual = stream.snake_case("This string") + expected = "this_string" + assert actual == expected + + +class TestGetNTopicSamples: + def test_get_n_topic_samples(self): + test_df = DataFrame( + { + "responses": ["hello word", "world hello", "hello hello"], + "topic_1": [0, 2, 1], + } + ) + actual = stream.get_n_topic_samples( + text_with_topic_df=test_df, topic_name="Topic_1", n=2 + ) + expected = DataFrame( + {"responses": ["world hello", "hello hello"], "topic_1": [2, 1]} + ) + assert all(actual == expected) + + +class TestGetResponseNo: + def test_get_response_no(self): + test_df = DataFrame( + { + "responses": ["hello word", "world hello", "hello hello"], + "index": [455, 12, 11], + } + ) + actual = stream.get_response_no(topic_sample=test_df, position=1) + expected = "Response 12" + assert actual == expected + + +class TestGenerateTopScores: + def test_generate_top_scores(self): + test_df = DataFrame( + { + "responses": ["hello word", "world hello", "hello hello"], + "index": [53, 22, 12], + "topic_1": [0.1, 0.3, 0.01], + "topic_2": [0.12, 0.22, 0.32], + } + ) + actual = stream.generate_top_scores( + topic_sample=test_df, topic_name="Topic 1", position=1 + ) + expected = "(Topic 1; Score: 30.0%) (Topic 2; Score: 22.0%)" + assert actual == expected + + +class TestSingleTopicFormatting: + def test_single_topic_formatting(self): + test_top_words = DataFrame({"word": ["hello world", "happy"]}).word + test_topic_sample = DataFrame( + { + "responses": [ + "hello world how are you", + "world hello how am i", + "I am so happy hello my world", + ], + "index": [53, 22, 12], + "topic_1": [0.1, 0.3, 0.01], + "topic_2": [0.12, 0.22, 0.32], + } + ) + actual = stream.single_topic_formatting( + top_n_words=test_top_words, + topic_sample=test_topic_sample, + topic_name="Topic 1", + topic_names=["Topic 1", "Topic 2"], + stopwords=["my"], + ) + expected = [ + [("hello world", "Topic 1", "#1f77b4"), "how are you"], + ["world hello how am i"], + [ + "I am so", + ("happy", "Topic 1", "#1f77b4"), + ("hello my world", "Topic 1", "#1f77b4"), + ], + ] + assert actual == expected + + +class TestGetSingleTopicColor: + def test_get_single_topic_color(self): + test_topic_names = ["Topic 1", "Topic 2"] + topic_1 = stream.get_single_topic_color( + topic_names=test_topic_names, topic_name="Topic 1" + ) + topic_2 = stream.get_single_topic_color( + topic_names=test_topic_names, topic_name="Topic 2" + ) + assert topic_1 != topic_2 + + +class TestGetHexColors: + def test_get_hex_colors_is_hex(self): + actual = stream.get_hex_colors(n_colors=1) + assert re.match(r"#[a-zA-Z0-9]{6}", actual[0]), "does not match hex pattern" + + def test_get_hex_colors_n_returns(self): + actual = stream.get_hex_colors(n_colors=4) + assert len(actual) == 4 + actual = stream.get_hex_colors(n_colors=2) + assert len(actual) == 2 + + +class TestReindexTopWords: + def test_reindex_top_words(self): + test_top_words = Series(["hoppy", "hello world", "happy"], name="word") + actual = stream.reindex_top_words(test_top_words) + expected = Series( + ["hello world", "hoppy", "happy"], index=[1, 0, 2], name="word" + ) + assert all(actual == expected) + + +class TestCountWords: + def test_count_words(self): + assert stream.count_words("hello world") == 2 + assert stream.count_words("hello") == 1 + + +class TestCreateWordStopWordCombos: + def test_create_word_stopword_combo(self): + test_stopwords = ["he", "her"] + test_words = Series(["hello world", "hello"], index=[21, 42]) + actual = stream.create_word_stopword_combos( + top_n_words=test_words, stopwords=test_stopwords + ) + expected = ["hello he world", "hello her world", "hello world", "hello"] + assert actual == expected + + +class TestCreateFormattingDictionary: + def test_create_formatting_dictionary(self): + test_word_stopword_combos = ["hello my world", "hello world"] + actual = stream.create_formatting_dictionary( + word_stopword_combos=test_word_stopword_combos, + topic_name="Topic 1", + topic_color="#000000", + ) + expected = { + "hello_my_world": "['hello my world', 'Topic 1', '#000000']", + "hello_world": "['hello world', 'Topic 1', '#000000']", + } + assert actual == expected + + +class TestInsertFormattingList: + def test_insert_formatting_list(self): + test_string = "hello my world, how are you this glorious day" + test_replacement_dict = { + "hello_my_world": "['hello my world', 'Topic 1', '#000000']" + } + actual = stream.insert_formatting_list( + string=test_string, + replacement_dict=test_replacement_dict, + word_stopword_combos=["hello my world"], + ) + expected = ( + "['hello my world', 'Topic 1', '#000000']," + + " how are you this glorious day" + ) + assert actual == expected + + +class TestSplitStringOnList: + def test_split_string_on_list(self): + test_string = "hello ['world', 'Topic 1', '#000000'], how are you" + actual = stream.split_string_on_list(test_string) + expected = ["hello", "['world', 'Topic 1', '#000000']", " how are you"] + assert actual == expected + + +class TestInsertTuple: + def test_insert_tuple(self): + test_list = ["hello", "['world', 'Topic 1', '#000000']", " how are you"] + actual = stream.insert_tuple(test_list) + expected = ["hello", ("world", "Topic 1", "#000000"), " how are you"] + assert actual == expected + + +class TestMultitopicFormatting: + def test_dominant_topics(self): + test_dominant_df = DataFrame( + {"word": ["hello", "world"], "variable": ["topic_1", "topic_2"]} + ) + test_topic_sample = DataFrame( + { + "index": [23, 25, 29], + "responses": [ + "hello world how are you", + "hello my world how are you", + "poppy flowers on sunday in the world", + ], + "topic_1": [0.1, 0.4, 0.8], + "topic_2": [1.0, 0.6, 0.4], + } + ) + actual = stream.multitopic_formatting( + dominant_topics=test_dominant_df, + topic_sample=test_topic_sample, + topic_names=["Topic 1", "Topic 2"], + ) + expected = [ + [ + ("hello", "Topic 1", "#1f77b4"), + " ", + ("world", "Topic 2", "#ff7f0e"), + " ", + "how ", + "are ", + "you ", + ], + [ + ("hello", "Topic 1", "#1f77b4"), + " ", + "my ", + ("world", "Topic 2", "#ff7f0e"), + " ", + "how ", + "are ", + "you ", + ], + [ + "poppy ", + "flowers ", + "on ", + "sunday ", + "in ", + "the ", + ("world", "Topic 2", "#ff7f0e"), + " ", + ], + ] + assert actual == expected + + +class TestCreateFormattingTuple: + def test_create_formatting_tuple(self): + test_dominant_topics = DataFrame( + {"variable": ["topic_1", "topic_2"]}, index=["hello", "world"] + ) + test_topic_color_dict = {"Topic 1": "#000000", "Topic 2": "#999999"} + actual = stream.create_formatting_tuple( + dominant_topics=test_dominant_topics, + word="hello", + topic_color_dict=test_topic_color_dict, + ) + + expected = ("hello", "Topic 1", "#000000") + assert actual == expected