From 511201f8a7a49c1c0c006712daa185acbf1a3958 Mon Sep 17 00:00:00 2001
From: Colin Daglish <colin.daglish@ons.gov.uk>
Date: Wed, 18 Oct 2023 17:55:31 +0100
Subject: [PATCH] Add word counts

---
 src/modules/word_counts.py | 23 +++++++++++++++++++++++
 streamlit_app.py           |  7 +++++++
 2 files changed, 30 insertions(+)
 create mode 100644 src/modules/word_counts.py

diff --git a/src/modules/word_counts.py b/src/modules/word_counts.py
new file mode 100644
index 0000000..5109d9b
--- /dev/null
+++ b/src/modules/word_counts.py
@@ -0,0 +1,23 @@
+import plotly.graph_objects as go
+from matplotlib.figure import Figure
+from pandas import Series
+
+
+def plot_word_counts(data: Series) -> go.Figure:
+    """Create a Plotly boxplot of response word counts.
+
+    Parameters
+    ----------
+    data : Series
+        a series of word counts corresponding to the original data
+
+    Returns
+    -------
+    plotly.graph_objects.Figure
+        a boxplot of word counts for the responses (not a matplotlib Figure)
+    """
+    fig = go.Figure()
+    fig.add_trace(go.Box(x=data, name=""))
+    fig.update_layout(title=go.layout.Title(text="Response word counts"), height=300)
+
+    return fig
diff --git a/streamlit_app.py b/streamlit_app.py
index 1b9dc74..8c94870 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -1,5 +1,6 @@
 import re
 import warnings
+from collections import Counter
 from datetime import datetime as dt
 
 import numpy as np
@@ -14,6 +15,7 @@
 from src.modules import spell_correct as spell
 from src.modules import streamlit as stream
 from src.modules import topic_modelling as topic
+from src.modules import word_counts
 from src.modules.config import Config
 
 # Page configuration
@@ -210,6 +212,8 @@
     spell_checker = spell.update_spell_dictionary(config["spelling"])
     raw_series = raw_data[question]
     response_char_lengths = prep.get_response_length(raw_series)
+    response_word_counts = pd.Series([len(Counter(x.split())) for x in raw_series])
+    word_counts_fig = word_counts.plot_word_counts(response_word_counts)
     average_response_char_length = response_char_lengths.mean()
     # Cleaning
     no_ans_removed = prep.remove_no_answer(raw_series)
@@ -278,6 +282,9 @@
         st.metric("Question Responses", len(spelling_fixed))
     with a2:
         st.metric("Total Responses", len(raw_data))
+
+    container = st.container()
+    container.plotly_chart(word_counts_fig)
     st.divider()
 
     # Topic Word Dataframe configuration