From 511201f8a7a49c1c0c006712daa185acbf1a3958 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Wed, 18 Oct 2023 17:55:31 +0100 Subject: [PATCH] Add word counts --- src/modules/word_counts.py | 23 +++++++++++++++++++++++ streamlit_app.py | 7 +++++++ 2 files changed, 30 insertions(+) create mode 100644 src/modules/word_counts.py diff --git a/src/modules/word_counts.py b/src/modules/word_counts.py new file mode 100644 index 0000000..5109d9b --- /dev/null +++ b/src/modules/word_counts.py @@ -0,0 +1,23 @@ +import plotly.graph_objects as go +from matplotlib.figure import Figure +from pandas import Series + + +def plot_word_counts(data: Series) -> Figure: + """Create a word count boxplot + + Parameters + ---------- + data : Series + a series of word counts corresponding to the original data + + Returns + ------- + Figure + a boxplot of word counts for the responses + """ + fig = go.Figure() + fig.add_trace(go.Box(x=data, name="")) + fig.update_layout(title=go.layout.Title(text="Response word counts"), height=300) + + return fig diff --git a/streamlit_app.py b/streamlit_app.py index 1b9dc74..8c94870 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -1,5 +1,6 @@ import re import warnings +from collections import Counter from datetime import datetime as dt import numpy as np @@ -14,6 +15,7 @@ from src.modules import spell_correct as spell from src.modules import streamlit as stream from src.modules import topic_modelling as topic +from src.modules import word_counts from src.modules.config import Config # Page configuration @@ -210,6 +212,8 @@ spell_checker = spell.update_spell_dictionary(config["spelling"]) raw_series = raw_data[question] response_char_lengths = prep.get_response_length(raw_series) + response_word_counts = pd.Series([len(Counter(x.split())) for x in raw_series]) + word_counts_fig = word_counts.plot_word_counts(response_word_counts) average_response_char_length = response_char_lengths.mean() # Cleaning no_ans_removed = 
prep.remove_no_answer(raw_series) @@ -278,6 +282,9 @@ st.metric("Question Responses", len(spelling_fixed)) with a2: st.metric("Total Responses", len(raw_data)) + + container = st.container() + container.plotly_chart(word_counts_fig) st.divider() # Topic Word Dataframe configuration