-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyze_zoekplaatje.py
89 lines (73 loc) · 3.12 KB
/
analyze_zoekplaatje.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
Process a questions file and zoekplaatje csv and output various statistics
"""
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from urllib.parse import unquote
# LOAD INPUT DATA
# Questions table; each row is one question (see the count printed below).
questions_file = "data/questions.csv"
df_q = pd.read_csv(questions_file)

# Zoekplaatje export (the file name indicates google.com result pages).
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
zp_file = "C:/Users/shagen/surfdrive/UvA/work/2024_bing-content-moderation/data/zoekplaatje-export-google.com-2024-11-13T160209.csv"
df_zp = pd.read_csv(zp_file)

print(f"Loaded in {len(df_q)} questions")
# Uncomment to inspect the columns and dtypes of either frame:
# print(df_q.info())
# print(df_zp.info())
# DROP UNKNOWN ELEMENTS FROM ZOEKPLAATJE RESULTS
# Unknowns turned out, upon closer inspection, to be invisible or highly
# irregular elements, so they are excluded from the analysis.
original_len = len(df_zp)
print("Dropping unknown elements from zoekplaatje list")
# na=True treats rows with a missing "type" as unknown as well; without it the
# mask would contain NaN for those rows and the `~` negation would raise.
is_unknown = df_zp["type"].str.contains("unknown", na=True)
df_in = df_zp[~is_unknown]
# Report against the size before filtering, not the size that is left over.
print(f"Dropped {original_len - len(df_in)} Zoekplaatje rows from {original_len} rows")
# ADD ZOEKPLAATJE ELEMENTS TO QUESTIONS DF
# Every question row gets two lists of (element type, section) tuples:
#   all_elements  -- every element found for the question's query
#   only_snippets -- the subset whose type does not contain "organic"
di_zp_elements = {}  # NOTE(review): not referenced anywhere else in the visible script
# Index the questions by their lower-cased text so that Zoekplaatje rows can be
# matched to a question through their query string.
df_q.index = df_q["question_simplified_contextualized"].str.lower()
df_q["all_elements"] = [[] for _ in range(len(df_q))]
df_q["only_snippets"] = [[] for _ in range(len(df_q))]
df_q["board"] = [''] * len(df_q)
count_cols = ["4chanpol_count","4chanint_count","4chanlgbt_count","4chanb_count","4chank_count","4chanfit_count","leftypol_count"]

# Iterate the filtered frame (df_in), not the raw export: iterating df_zp would
# re-introduce the "unknown" rows that were dropped, and since their type does
# not contain "organic" they would be counted as snippets.
for query, element_type, section in zip(df_in["query"], df_in["type"], df_in["section"]):
    # unquote() undoes percent-encoding; lower-casing matches the lower-cased
    # index set above, so a differently-cased query still finds its question.
    q_clean = unquote(query).lower()
    element = (element_type, section)
    # A query that matches no question raises KeyError here.
    # NOTE(review): assumes question texts are unique; with duplicate index
    # labels .loc returns a Series instead of the list -- confirm.
    df_q.loc[q_clean, "all_elements"].append(element)
    # Also keep the non-organic results separately.
    # We see this as the "enrichment"
    if "organic" not in element_type:
        df_q.loc[q_clean, "only_snippets"].append(element)
# Set "board" to the name of the count column (minus the "_count" suffix)
# whose value is positive. Columns are applied in list order, so when several
# counts are positive the last one in count_cols wins -- the same precedence a
# row-by-row pass over the columns would give.
for count_col in count_cols:
    # Positional boolean mask: one vectorised write per column instead of a
    # label lookup per cell, and it cannot spill over to other rows that happen
    # to share an index label.
    has_positive_count = (df_q[count_col] > 0).to_numpy()
    df_q.loc[has_positive_count, "board"] = count_col.replace("_count", "")
# Keep only the questions that have at least one non-organic element attached.
old_len = len(df_q)
# (Swap "only_snippets" for "all_elements" to keep every question that has any
# result at all.)
snippet_counts = df_q["only_snippets"].map(len)
has_snippets = snippet_counts > 0
df_with_snips = df_q[has_snippets]
print(f"Removed {old_len - len(df_with_snips)} rows without snippets, kept {len(df_with_snips)}")
# SUBQUESTION 1: Is there a correlation between question toxicity and the amount of snippets shown?
# Visualised as a scatter plot: toxicity score on x, number of non-organic
# elements on y, one colour per board.
# (Or a matrix?)
cmap = {
    "4chanpol": "brown",
    "4chanint": "blue",
    "4chanlgbt": "pink",
    "4chanb": "green",
    "4chank": "purple",
    "4chanfit": "yellow",
    "leftypol": "orange",
}
# Rows whose count columns were all non-positive still carry the empty default
# board label, which is not a key of cmap; give them a neutral colour instead
# of failing with a KeyError.
no_board_color = "grey"
colors = [cmap.get(b, no_board_color) for b in df_with_snips["board"]]
x_values = df_with_snips["TOXICITY"]
y_values = df_with_snips["only_snippets"].map(len).tolist()

plt.clf()
plt.title("Do toxic questions get more or less enriched by Google?")
plt.xlabel("Perspective API toxicity score")
plt.ylabel("Number of non-organic SERP snippets")
plt.grid(alpha=.2)

legend_labels = [mpatches.Patch(color=v, label=k) for k, v in cmap.items()]
# Only advertise the fallback colour when it is actually on the plot.
if no_board_color in colors:
    legend_labels.append(mpatches.Patch(color=no_board_color, label="no board"))

# Marker size is taken from the "replies" column.
plt.scatter(x_values, y_values, s=df_with_snips["replies"], c=colors, alpha=0.4)
plt.legend(handles=legend_labels, loc="upper right", title="boards")
plt.show()
# Step 1:
# SUBQUESTION 2: Are different topics differently enriched?
# Let's visualise this as box plots per board, with the boxes denoting the mean + deviation of the amount of
# non-organic snippets
# Step 1: