Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
  • Loading branch information
cboulanger committed Oct 13, 2023
1 parent fcc678d commit ce8a79d
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 20 deletions.
Binary file modified docs/article-fig-10.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 10 additions & 14 deletions jls-article-figures.ipynb

Large diffs are not rendered by default.

28 changes: 24 additions & 4 deletions jls-corpus-analyses.ipynb

Large diffs are not rendered by default.

22 changes: 20 additions & 2 deletions scripts/occurrence_by_year_scatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,18 @@ def prepare_data(articles_df, regex_list, column='text'):
data = []
regex_list.reverse()
for regex in regex_list:
if type(regex) is tuple:
term, regex = regex
else:
term = regex
for year, year_df in articles_df.groupby('year'):
total_word_count = year_df[column].apply(lambda x: len(x.split())).sum()
count = year_df[column].apply(lambda x: len(re.findall(regex, x))).sum()
data.append({'year': year, 'term': regex, 'count': count, 'total_word_count': total_word_count})
data.append({'year': year,
'term': term,
'count': count,
'regex': regex,
'total_word_count': total_word_count})

aggregated = pd.DataFrame(data)
return aggregated
Expand All @@ -27,7 +35,7 @@ def print_occurences(data_frame, regex_list):
print("\n".join(output_list))


def plot_by_year(data, dep_col='term',
def plot_by_year(data, dep_col='term', col_search_term='regex',
title= None, x_label=None, y_label=None, y_axis_limit=None,
file=None, dpi=300, scale_factor=300, color='darkblue'):
years = data['year'].values
Expand Down Expand Up @@ -56,6 +64,16 @@ def plot_by_year(data, dep_col='term',
# Scatter plot with scaled relative frequencies as size
ax.scatter(years, dep_var, s=scaled_relative_frequencies, color=color, zorder=2)

# Annotate each term with its search term (right-aligned, small font, in grey) -- currently disabled:
# if col_search_term in data.columns:
# annotated_terms = set() # Keep track of terms that have been annotated
# for y, term, search_term in zip(dep_var, data[dep_col], data[col_search_term]):
# if term not in annotated_terms:
# annotated_terms.add(term)
# ax.annotate(search_term, (min(years), y), textcoords="offset points",
# xytext=(-10, 0), ha='right', fontsize='small', color='grey')


# Connect the earliest and latest point (among years with a non-zero count) of each observed variable with a line
for dep_v in set(dep_var):
dep_data = data[(data[dep_col] == dep_v) & (data['count'] > 0)].sort_values('year')
Expand Down

0 comments on commit ce8a79d

Please sign in to comment.