Skip to content

Commit

Permalink
Add gui, improve script utilization calls
Browse files Browse the repository at this point in the history
  • Loading branch information
gabrielandrade2 committed Sep 7, 2023
1 parent 6745710 commit aa4535d
Show file tree
Hide file tree
Showing 6 changed files with 250 additions and 110 deletions.
3 changes: 2 additions & 1 deletion .idea/AnonymizationTool.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

189 changes: 189 additions & 0 deletions gui.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import tkinter as tk
from tkinter import filedialog
from main import main
from tktooltip import ToolTip


class ListboxEditable(tk.Frame):
    """Frame holding an entry field, a listbox and a scrollbar for editing a list of strings.

    Pressing <Return> in the entry appends its text to the list; pressing
    <Delete> on the listbox removes the selected entries. The current items
    are available through the ``list`` attribute.
    """

    def __init__(self, root: tk.Misc, selectmode=tk.MULTIPLE, height=5, width=20, borderwidth=1, relief=tk.SOLID):
        """Build the child widgets and wire up the key bindings.

        :param root: Parent widget that will contain this frame.
        :param selectmode: Selection mode forwarded to the inner ``tk.Listbox``.
        :param height: Listbox height, forwarded to ``tk.Listbox``.
        :param width: Listbox width, forwarded to ``tk.Listbox``.
        :param borderwidth: Listbox border width, forwarded to ``tk.Listbox``.
        :param relief: Listbox relief style, forwarded to ``tk.Listbox``.
        """
        super().__init__(root)

        self.root = root

        self.sb = tk.Scrollbar(self)
        self.sb.pack(side=tk.RIGHT, fill=tk.Y)
        self.E1 = tk.Entry(self)
        self.E1.pack()

        # Python-side copy of the items; it is mutated in place and pushed to
        # the listbox through the variable below after every change.
        self.list = []

        self.v = tk.StringVar(value=self.list)
        self.b1 = tk.Listbox(self, activestyle='dotbox', yscrollcommand=self.sb.set, listvariable=self.v,
                             selectmode=selectmode, height=height, width=width, borderwidth=borderwidth,
                             relief=relief)

        self.sb.config(command=self.b1.yview)
        self.b1.pack()

        self.b1.bind('<Delete>', self.remove_item)
        self.E1.bind('<Return>', self.set_item)

    def set_entry_text(self, text):
        """Replace the content of the entry field with ``text``."""
        self.E1.delete(0, tk.END)
        self.E1.insert(0, text)

    def set_item(self, event):
        """Append the entry text to the list and clear the entry; empty text is ignored.

        :param event: The Tk event that triggered the callback (unused).
        """
        text = self.E1.get()
        if not text:
            return
        self.E1.delete(0, tk.END)
        self.list.append(text)

        self.v.set(self.list)

    def remove_item(self, event):
        """Remove every selected entry from the list; do nothing without a selection.

        :param event: The Tk event that triggered the callback (unused).
        """
        selected = self.b1.curselection()
        if not selected:
            return
        # Delete from the highest index down so that earlier removals do not
        # shift the positions of the entries still to be removed.
        for index in reversed(selected):
            if index < len(self.list):
                del self.list[index]
        # Drop the old selection so it does not end up highlighting whichever
        # entries moved into the freed positions.
        self.b1.selection_clear(0, tk.END)
        self.v.set(self.list)


def browse_files():
    """Open a multi-file chooser and append every chosen path to the file listbox."""
    chosen = filedialog.askopenfilenames(title="Select Files and Directories", filetypes=[("All Files", "*.*")])
    if chosen:
        # A single insert call appends all paths in the order they were returned.
        file_listbox.insert(tk.END, *chosen)


def remove_selected_files():
    """Delete every currently selected entry from the file listbox."""
    # Walk the selection from the bottom up so that deleting one entry does
    # not change the index of the entries still to be deleted.
    for position in reversed(file_listbox.curselection()):
        file_listbox.delete(position)


def browse_output_dir():
    """Ask the user for an output directory and show it in the output directory entry.

    If the dialog is dismissed without a choice, the entry keeps its current content.
    """
    output_dir = filedialog.askdirectory(title="Select Output Directory")
    # The dialog yields an empty value when cancelled; overwriting the entry
    # with it would silently discard a directory the user had already set.
    if not output_dir:
        return
    output_dir_entry.delete(0, tk.END)
    output_dir_entry.insert(tk.END, output_dir)


def anonymize_documents():
    """Run the anonymizer on the listed files and write a summary (or the error) to the results box."""

    def write(text):
        # Append text at the end of the results box.
        results_text.insert(tk.END, text)

    # Make the results box writable for the duration of the update and start from a clean slate.
    results_text.config(state=tk.NORMAL)
    results_text.delete(1.0, tk.END)
    try:
        selected_files = file_listbox.get(0, tk.END)
        destination = output_dir_entry.get()
        anonymization_count, files = main(
            selected_files,
            destination,
            force_anonymize_columns=force_anonymize_columns_listbox.list,
            force_anonymize_tokens=force_anonymize_tokens_listbox.list,
            stop_words=stop_words_listbox.list,
        )
        print(anonymization_count)
        write(f"{files} files processed")
        write("\n\n")
        write("Entities removed:\n")
        for label, total in anonymization_count.items():
            write(f"{label}: {total}")
            write("\n")
    except Exception as e:
        # Any failure is reported in the results box instead of propagating.
        write(f"Error: {e}")

    # Lock the results box again once the report has been written.
    results_text.config(state=tk.DISABLED)


def toggle_advanced_options():
    """Show or hide the advanced options frame and flip the button caption accordingly."""
    # The button caption doubles as the state flag: "Advanced Options" means the frame is hidden.
    currently_hidden = advanced_options_button["text"] == "Advanced Options"
    if not currently_hidden:
        advanced_options_button["text"] = "Advanced Options"
        advanced_options_frame.grid_forget()
        return

    advanced_options_button["text"] = "Hide Options"
    advanced_options_frame.grid(row=6, columnspan=2, pady=20)


if __name__ == '__main__':
    # Build the window and its widgets. The widgets are created as module-level
    # names because the callback functions above look them up as globals.

    # Create the main window
    window = tk.Tk()
    window.title("Anonymizer Tool")

    # --- Input file selection ---
    file_label = tk.Label(window, text="Selected Files/Directories:")
    file_label.grid(row=0, column=0, padx=10, pady=10, sticky="w")

    file_listbox = tk.Listbox(window, selectmode=tk.MULTIPLE, height=10, width=50, borderwidth=1, relief=tk.SOLID)
    # sticky="nsew" lets the listbox actually fill its cell when the window is
    # resized; without it the weighted row/column grow but the widget does not.
    file_listbox.grid(row=1, column=0, padx=10, pady=10, sticky="nsew")
    file_listbox.bind("<Delete>", lambda event: remove_selected_files())

    file_listbox_buttons_frame = tk.Frame(window)
    file_listbox_buttons_frame.grid(row=1, column=1, padx=10, pady=10)

    select_file_button = tk.Button(file_listbox_buttons_frame, text="Select File(s)/Dir(s)", command=browse_files)
    select_file_button.grid(row=0, column=0, padx=10, pady=10, sticky="w")

    remove_file_button = tk.Button(file_listbox_buttons_frame, text="Remove Selected", command=remove_selected_files)
    remove_file_button.grid(row=1, column=0, padx=10, pady=10)

    # --- Output directory selection ---
    output_dir_label = tk.Label(window, text="Selected Output Directory:")
    output_dir_label.grid(row=3, column=0, padx=10, pady=10, sticky="w")

    output_dir_entry = tk.Entry(window, width=50, borderwidth=1, relief=tk.SOLID)
    output_dir_entry.grid(row=4, column=0, padx=10, pady=10)

    select_output_button = tk.Button(window, text="Select Output Directory", command=browse_output_dir)
    select_output_button.grid(row=4, column=1, padx=10, pady=10, sticky="w")

    # --- Advanced options ---
    # The frame is created here but only placed on the grid (row 6) when the
    # button is pressed; see toggle_advanced_options.
    advanced_options_button = tk.Button(window, text="Advanced Options", command=toggle_advanced_options)
    advanced_options_button.grid(row=5, columnspan=2, pady=20)
    advanced_options_frame = tk.Frame(window)

    # Labels for advanced options
    force_anonymize_columns_label = tk.Label(advanced_options_frame, text="Force Anonymize Columns:")
    force_anonymize_tokens_label = tk.Label(advanced_options_frame, text="Force Anonymize Tokens:")
    stop_words_label = tk.Label(advanced_options_frame, text="Stop Words:")

    # Listboxes for advanced options
    force_anonymize_columns_listbox = ListboxEditable(advanced_options_frame, selectmode=tk.MULTIPLE, height=3,
                                                      width=20, borderwidth=1, relief=tk.SOLID)
    force_anonymize_tokens_listbox = ListboxEditable(advanced_options_frame, selectmode=tk.MULTIPLE, height=3,
                                                     width=20, borderwidth=1, relief=tk.SOLID)
    stop_words_listbox = ListboxEditable(advanced_options_frame, selectmode=tk.MULTIPLE, height=3,
                                         width=20, borderwidth=1, relief=tk.SOLID)

    # One column per option: label on row 0, editable listbox on row 1,
    # all with the same padding so the three columns line up.
    force_anonymize_columns_label.grid(row=0, column=0, padx=10, pady=10)
    force_anonymize_tokens_label.grid(row=0, column=1, padx=10, pady=10)
    stop_words_label.grid(row=0, column=2, padx=10, pady=10)
    force_anonymize_columns_listbox.grid(row=1, column=0, padx=10, pady=10)
    force_anonymize_tokens_listbox.grid(row=1, column=1, padx=10, pady=10)
    stop_words_listbox.grid(row=1, column=2, padx=10, pady=10)

    # Tooltips for advanced options
    force_anonymize_columns_tooltip = ToolTip(force_anonymize_columns_listbox,
                                              "Names of the columns to be forcibly anonymized, regardless of the content type",
                                              delay=0.3)
    force_anonymize_tokens_tooltip = ToolTip(force_anonymize_tokens_listbox,
                                             "Special tokens that should always be anonymized, e.g. person names that were not detected",
                                             delay=0.3)
    stop_words_tooltip = ToolTip(stop_words_listbox,
                                 'Special words that implicate the previous word should be anonymized, e.g. "病院" or "クリニック"',
                                 delay=0.3)

    # --- Run button and results ---
    anonymize_button = tk.Button(window, text="Anonymize Documents", command=anonymize_documents)
    anonymize_button.grid(row=7, column=0, columnspan=2, pady=20)

    results_label = tk.Label(window, text="Results:")
    results_label.grid(row=8, column=0, padx=10, pady=10, sticky="w")

    # The results box starts disabled; anonymize_documents enables it only while writing.
    results_text = tk.Text(window, height=10, width=50, state=tk.DISABLED, borderwidth=1, relief=tk.SOLID)
    results_text.grid(row=9, column=0, padx=10, pady=10, columnspan=2, sticky="nsew")

    # Give the extra space on resize to the file listbox row, the results row
    # and the first column, so both the listbox and the results area expand.
    window.grid_rowconfigure(1, weight=1)
    window.grid_rowconfigure(9, weight=1)
    window.grid_columnconfigure(0, weight=1)

    # Start the main loop
    window.mainloop()
86 changes: 54 additions & 32 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import argparse
import glob
import os.path
import re

import MeCab
from openpyxl import load_workbook
from unidic import unidic

ANONYMIZED_TAG = '[ANON]'

Expand All @@ -16,27 +13,32 @@
'Other': 0,
}
tagger = MeCab.Tagger()
# tagger = MeCab.Tagger('-d "{}"'.format(unidic.DICDIR))
force_anonymize = []
special_tokens = []
# tagger = MeCab.Tagger('-d /opt/homebrew/lib/mecab/dic/ipadic')
# tagger = MeCab.Tagger('-r /dev/null -d /opt/homebrew/lib/mecab/dic/mecab-ipadic-neologd')

force_anonymize_columns = []
force_anonymize_tokens = []
stop_words = []
out_dir = None

def process(file):

def process_file(file):
filename = os.path.split(file)[1]
f = load_workbook(file)
for sheet in f.worksheets:
for column in sheet.iter_cols():
if column[0].value in force_anonymize:
if column[0].value in force_anonymize_columns:
for cell in column[1:]:
cell.value = force_deidentify(str(cell.value))
continue
for cell in column[1:]:
if isinstance(cell.value, str):
print(cell)
cell.value = deidentify(cell.value)

out = os.path.join(out_dir, filename)
if os.path.exists(out):
out.replace(".xlsx", "_2.xlsx")
out = out.replace(".xlsx", "_anon.xlsx")
f.save(out)


Expand All @@ -55,7 +57,7 @@ def should_deidentify(token):
else:
anonymization_count["Other"] += 1
return True
elif token[0] in special_tokens:
elif token[0] in force_anonymize_tokens:
anonymization_count["Special tokens"] += 1
return True
return False
Expand All @@ -66,14 +68,30 @@ def get_mecab_parsing(text):
tagger.parse(text).splitlines()[:-1]]


def deidentify(text):
def deidentify(text: str):
"""
Method that performs the actual anonymization of texts. Can be called directly from other scripts in order to
execute the anonymization logic in a single string.
:param text: The text to be anonymized.
:return: The anonymized text.
"""
parsed = get_mecab_parsing(text)

# Debug print
for token in parsed:
print(token)

anonymized_text = list()
for i, token in enumerate(parsed):
if should_deidentify(token):
text = text.replace(token[0], ANONYMIZED_TAG)
if token[0] in ["病院", "クリニック"]:
text = text.replace(parsed[i - 1][0], ANONYMIZED_TAG)
return text
anonymized_text.append(ANONYMIZED_TAG)
elif token[0] in stop_words:
anonymized_text[-1] = ANONYMIZED_TAG
anonymized_text.append(token[0])
else:
anonymized_text.append(token[0])
return "".join(anonymized_text)


def force_deidentify(text):
Expand All @@ -88,14 +106,22 @@ def process_directory(directory):
return [os.path.join(directory, f) for f in filtered]


def main(input, output, anonymize_columns=[], tokens=[]):
global force_anonymize
global special_tokens
def main(input: str, output: str, force_anonymize_columns: list = None, force_anonymize_tokens: list = None,
stop_words: list = None):
"""
Main function for anonymizing Excel files, called when executing this script directly.
:param input: The input file(s) or directory(ies)
:param output: The output directory
:param force_anonymize_columns: Columns to be forcibly anonymized, regardless of the content type
:param force_anonymize_tokens: Special tokens that should always be anonymized
:param stop_words: Special words that implicate the previous word should be anonymized, e.g. "病院" or "クリニック"
:return:
"""
global out_dir
if anonymize_columns != None:
force_anonymize = anonymize_columns
if tokens != None:
special_tokens = tokens
globals()['force_anonymize_columns'] = force_anonymize_columns
globals()['force_anonymize_tokens'] = force_anonymize_tokens
globals()['stop_words'] = stop_words
out_dir = output

# Parse file list
Expand All @@ -112,30 +138,26 @@ def main(input, output, anonymize_columns=[], tokens=[]):

for file in files:
try:
process(file)
process_file(file)
except Exception as e:
print(f"Error processing {file}: {e}")
continue


# print(deidentify("田中と申します。大阪にすんでいます。"))
return anonymization_count, len(files)



if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Excel file anonymizer tool')
parser.add_argument('--input', type=str, nargs='+', required=True,
help='Input file(s) or directory(ies) path')
parser.add_argument('--output', type=str, required=True, help='Anonymized output folder')
parser.add_argument('--force_anonymize', type=str, nargs='+',
parser.add_argument('--force_anonymize_columns', type=str, nargs='+',
help='Columns to be forcibly anonymized, regardless of the content type')
parser.add_argument('--special_tokens', type=str, nargs='+',
help='Special tokens that should always be anonymized')
parser.add_argument('--special_tokens', type=str, nargs='+',
parser.add_argument('--force_anonymize_tokens', type=str, nargs='+',
help='Special tokens that should always be anonymized')
# parser.add_argument('--replace_file_name', type=srt, help='Replace the file name with the given string')
parser.add_argument('--stop_words', type=str, nargs='+',
help='Special words that implicate the previous word should be anonymized, e.g. "病院" or "クリニック"')

args = parser.parse_args()

main(args.input, args.output, args.force_anonymize, args.special_tokens)
main(args.input, args.output, args.force_anonymize_columns, args.force_anonymize_tokens, args.stop_words)
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
et-xmlfile==1.1.0
mecab==0.996.3
openpyxl==3.1.2
tkinter-tooltip==2.1.0
Loading

0 comments on commit aa4535d

Please sign in to comment.