Adding Open question transformer #204

Open · wants to merge 2 commits into master

Changes from all commits
2 changes: 2 additions & 0 deletions convokit/open_question/__init__.py
@@ -0,0 +1,2 @@
from .opennessScoreBERT import *
from .opennessScoreSimilarity import *
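
Since the package `__init__.py` re-exports both modules, the transformers should be importable from the subpackage once it is wired into ConvoKit's top-level package (the import path below is an assumption about that wiring, not something this diff shows):

    # hypothetical import path, assuming convokit registers the new subpackage
    from convokit.open_question import OpennessScoreBERT, OpennessScoreSimilarity
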
179 changes: 179 additions & 0 deletions convokit/open_question/opennessScoreBERT.py
@@ -0,0 +1,179 @@
from convokit import Corpus
from convokit.transformer import Transformer
from inspect import signature
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import language_tool_python


class OpennessScoreBERT(Transformer):
"""
A transformer to calculate openness score for all utterance

:param obj_type: type of Corpus object to calculate: 'conversation', 'speaker', or 'utterance', default to be 'utterance'
:param input_field: Input fields from every utterance object. Will default to reading 'utt.text'. If a string is provided, than consider metadata with field name input_field.
:param output_field: field for writing the computed output in metadata. Will default to write to utterance metadata with name 'capitalization'.
:param input_filter: a boolean function of signature `input_filter(utterance, aux_input)`. attributes will only be computed for utterances where `input_filter` returns `True`. By default, will always return `True`, meaning that attributes will be computed for all utterances.
:param verbosity: frequency at which to print status messages when computing attributes.
"""

def __init__(
self,
obj_type="utterance",
output_field="openness_score",
input_field=None,
input_filter=None,
model_name="bert-base-cased",
verbosity=1000,
):
if input_filter:
if len(signature(input_filter).parameters) == 1:
self.input_filter = lambda utt: input_filter(utt)
else:
self.input_filter = input_filter
else:
self.input_filter = lambda utt: True
self.obj_type = obj_type
self.input_field = input_field
self.output_field = output_field
self.verbosity = verbosity
        # public LanguageTool endpoint, used to clean up the extracted question
        self.grammar_tool = language_tool_python.LanguageToolPublicAPI("en")
        # minimal backchannel replies whose predictability is probed for each question
        self.answer_sample = ["Mhm", "Okay", "I see", "Yup"]
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

def _print_output(self, i):
return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0)

    def bert_score(self, question, answer):
        """
        Computes the pseudo-perplexity of the answer, conditioned on the question

        :param question: str
        :param answer: str
        :return: pseudo-perplexity of the answer tokens
        """
        sentence = question + " " + answer
        tensor_input = self.tokenizer.encode(sentence, return_tensors="pt")
        # number of question tokens, excluding the [CLS] and [SEP] special tokens
        question_tok_len = len(self.tokenizer.encode(question)) - 2
        # one copy of the input per answer token, each with a different position masked
        repeat_input = tensor_input.repeat(tensor_input.size(-1) - 2 - question_tok_len, 1)
        # superdiagonal matrix selects one position per row; keep only the answer rows
        mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2][question_tok_len:]
        masked_input = repeat_input.masked_fill(mask == 1, self.tokenizer.mask_token_id)
        # label -100 tells the model to ignore every position except the masked one
        labels = repeat_input.masked_fill(masked_input != self.tokenizer.mask_token_id, -100)
        with torch.inference_mode():
            loss = self.model(masked_input, labels=labels).loss
        return np.exp(loss.item())
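    # Worked example (token splits are assumptions about the BERT tokenizer): for
    # question "Is it good?" and answer "Yup", the joint encoding is roughly
    # [CLS] Is it good ? Yup [SEP] (7 tokens) with question_tok_len = 4, so
    # repeat_input has 7 - 2 - 4 = 1 row, whose mask puts [MASK] on "Yup"; the loss
    # at that single masked position, exponentiated, is the pseudo-perplexity.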

    def find_last_question(self, text):
        """
        Finds the last sentence in the text that ends with a question mark

        :param text: str
        :return: the last question in the text, or an empty string if there is none
        """
        end_sent = set([".", "?", "!"])
        last_q = text.rfind("?")
        # walk backwards from the last "?" to the previous sentence boundary
        for i in range(last_q - 1, -1, -1):
            if text[i] in end_sent:
                return text[i + 1 : last_q + 1].strip()
        return text[: last_q + 1].strip()

    def bert_openness_score(self, question):
        """
        Averages the pseudo-perplexity of the sample answers to the last question in the text
        """
        question = self.find_last_question(question)
        question = self.grammar_tool.correct(question)

        scores = []
        for ans in self.answer_sample:
            scores.append(self.bert_score(question, ans))
        return np.mean(scores)
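    # Interpretation (an assumption about the construction above, not stated in the
    # diff): a closed question makes minimal replies like "Yup" predictable, giving
    # low pseudo-perplexity, so higher mean scores should indicate more open questions.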

    def transform(self, corpus: Corpus) -> Corpus:
        """
        Score each object of the configured type on its openness and store the result
        in the corresponding metadata field.

        :param corpus: Corpus
        :return: the corpus
        """
if self.obj_type == "utterance":
total = len(list(corpus.iter_utterances()))

            for idx, utterance in enumerate(corpus.iter_utterances()):
                if self._print_output(idx):
                    print(f"{idx:03d}/{total:03d} {self.obj_type} processed")

                if not self.input_filter(utterance):
                    continue

                if self.input_field is None:
                    text_entry = utterance.text
                elif isinstance(self.input_field, str):
                    text_entry = utterance.meta.get(self.input_field)
                if text_entry is None:
                    continue

                # compute the openness score and write it to the output field
                score = self.bert_openness_score(text_entry)

                utterance.add_meta(self.output_field, score)

elif self.obj_type == "conversation":
total = len(list(corpus.iter_conversations()))
            for idx, convo in enumerate(corpus.iter_conversations()):
                if self._print_output(idx):
                    print(f"{idx:03d}/{total:03d} {self.obj_type} processed")

                if not self.input_filter(convo):
                    continue

                if self.input_field is None:
                    utt_lst = convo.get_utterance_ids()
                    text_entry = " ".join([corpus.get_utterance(x).text for x in utt_lst])
                elif isinstance(self.input_field, str):
                    text_entry = convo.meta.get(self.input_field)
                if text_entry is None:
                    continue

                # compute the openness score and write it to the output field
                score = self.bert_openness_score(text_entry)

                convo.add_meta(self.output_field, score)

elif self.obj_type == "speaker":
total = len(list(corpus.iter_speakers()))
            for idx, sp in enumerate(corpus.iter_speakers()):
                if self._print_output(idx):
                    print(f"{idx:03d}/{total:03d} {self.obj_type} processed")

                if not self.input_filter(sp):
                    continue

                if self.input_field is None:
                    utt_lst = sp.get_utterance_ids()
                    text_entry = " ".join([corpus.get_utterance(x).text for x in utt_lst])
                elif isinstance(self.input_field, str):
                    text_entry = sp.meta.get(self.input_field)
                if text_entry is None:
                    continue

                # compute the openness score and write it to the output field
                score = self.bert_openness_score(text_entry)

                sp.add_meta(self.output_field, score)

else:
raise KeyError("obj_type must be utterance, conversation, or speaker")

if self.verbosity > 0:
print(f"%03d/%03d {self.obj_type} processed" % (total, total))
return corpus
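
A minimal usage sketch for the BERT-based transformer (the corpus below is only an example; any ConvoKit corpus with utterance text would work, though the public LanguageTool API makes large runs slow):

    from convokit import Corpus, download
    from convokit.open_question import OpennessScoreBERT  # hypothetical import path

    corpus = Corpus(filename=download("subreddit-Cornell"))  # example corpus
    scorer = OpennessScoreBERT(obj_type="utterance", verbosity=100)
    corpus = scorer.transform(corpus)  # writes "openness_score" to each utterance's metadata
    print(corpus.random_utterance().meta["openness_score"])
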
199 changes: 199 additions & 0 deletions convokit/open_question/opennessScoreSimilarity.py
@@ -0,0 +1,199 @@
from convokit import Corpus
from convokit.transformer import Transformer
from inspect import signature
from nltk.tokenize import word_tokenize
import numpy as np
from sentence_transformers import SentenceTransformer, util
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever
from haystack.pipelines import DocumentSearchPipeline
import language_tool_python


class OpennessScoreSimilarity(Transformer):
"""
A transformer that uses BERT similarity to calculate openness score

:param obj_type: type of Corpus object to calculate: 'conversation', 'speaker', or 'utterance', default to be 'utterance'
:param input_field: Input fields from every utterance object. Will default to reading 'utt.text'. If a string is provided, than consider metadata with field name input_field.
:param output_field: field for writing the computed output in metadata. Will default to write to utterance metadata with name 'capitalization'.
:param input_filter: a boolean function of signature `input_filter(utterance, aux_input)`. attributes will only be computed for utterances where `input_filter` returns `True`. By default, will always return `True`, meaning that attributes will be computed for all utterances.
:param verbosity: frequency at which to print status messages when computing attributes.
"""

def __init__(
self,
obj_type="utterance",
output_field="openness_score",
input_field=None,
input_filter=None,
model_name="bert-base-cased",
verbosity=1000,
):
if input_filter:
if len(signature(input_filter).parameters) == 1:
self.input_filter = lambda utt: input_filter(utt)
else:
self.input_filter = input_filter
else:
self.input_filter = lambda utt: True
self.obj_type = obj_type
self.input_field = input_field
self.output_field = output_field
self.verbosity = verbosity
        self.grammar_tool = language_tool_python.LanguageToolPublicAPI("en")  # currently unused here
        self.answer_sample = ["Mhm", "Okay", "I see", "Yup"]  # currently unused here
        # BM25-searchable index of (question, answer) pairs, populated in fit()
        self.document_store = InMemoryDocumentStore(use_bm25=True)
        self.model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1", device="cpu")  # SBERT model

    def fit(self, corpus: Corpus, y=None):
        """Build the question-answer index for the given corpus; must be called before transform()."""
        self.corpus = corpus
        self._load_questions(corpus)
        return self

def _print_output(self, i):
return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0)

    def generated_openness_score_similarity(self, text):
        # for long inputs, query with only the last question rather than the full text
        if len(text) > 500 and len(word_tokenize(text)) > 100:
            text = " ".join(word_tokenize(self._find_last_question(text)))
        # retrieve up to 10 stored questions most similar to the query
        prediction = self.pipe.run(query=text, params={"Retriever": {"top_k": 10}})
        answers = [doc.meta["answer"] for doc in prediction["documents"]]
        # open questions should elicit diverse answers, i.e. low mutual similarity
        return self._avg_bert_sim(answers)

    def transform(self, corpus: Corpus) -> Corpus:
        """
        Score each object of the configured type on its openness and store the result
        in the corresponding metadata field.

        :param corpus: Corpus
        :return: the corpus
        """
if self.obj_type == "utterance":
total = len(list(corpus.iter_utterances()))

            for idx, utterance in enumerate(corpus.iter_utterances()):
                if self._print_output(idx):
                    print(f"{idx:03d}/{total:03d} {self.obj_type} processed")

                if not self.input_filter(utterance):
                    continue

                if self.input_field is None:
                    text_entry = utterance.text
                elif isinstance(self.input_field, str):
                    text_entry = utterance.meta.get(self.input_field)
                if text_entry is None:
                    continue

                # compute the openness score and write it to the output field
                score = self.generated_openness_score_similarity(text_entry)

                utterance.add_meta(self.output_field, score)

elif self.obj_type == "conversation":
total = len(list(corpus.iter_conversations()))
            for idx, convo in enumerate(corpus.iter_conversations()):
                if self._print_output(idx):
                    print(f"{idx:03d}/{total:03d} {self.obj_type} processed")

                if not self.input_filter(convo):
                    continue

                if self.input_field is None:
                    utt_lst = convo.get_utterance_ids()
                    text_entry = " ".join([corpus.get_utterance(x).text for x in utt_lst])
                elif isinstance(self.input_field, str):
                    text_entry = convo.meta.get(self.input_field)
                if text_entry is None:
                    continue

                # compute the openness score and write it to the output field
                score = self.generated_openness_score_similarity(text_entry)

                convo.add_meta(self.output_field, score)

elif self.obj_type == "speaker":
total = len(list(corpus.iter_speakers()))
            for idx, sp in enumerate(corpus.iter_speakers()):
                if self._print_output(idx):
                    print(f"{idx:03d}/{total:03d} {self.obj_type} processed")

                if not self.input_filter(sp):
                    continue

                if self.input_field is None:
                    utt_lst = sp.get_utterance_ids()
                    text_entry = " ".join([corpus.get_utterance(x).text for x in utt_lst])
                elif isinstance(self.input_field, str):
                    text_entry = sp.meta.get(self.input_field)
                if text_entry is None:
                    continue

                # compute the openness score and write it to the output field
                score = self.generated_openness_score_similarity(text_entry)

                sp.add_meta(self.output_field, score)

else:
raise KeyError("obj_type must be utterance, conversation, or speaker")

if self.verbosity > 0:
print(f"%03d/%03d {self.obj_type} processed" % (total, total))
return corpus

# helper function
    def _load_questions(self, corpus):
        """Index every question in the corpus, pairing it with the utterance that follows it as the answer."""
        docs = []
        convo_ids = corpus.get_conversation_ids()
        for idx in convo_ids:
            convo = corpus.get_conversation(idx)
            utts = convo.get_chronological_utterance_list()
            had_question = False
            before_text = ""
            for utt in utts:
                if had_question:
                    # the previous utterance asked a question; store it with this reply,
                    # e.g. {"content": "What did you think?", "meta": {..., "answer": "It was fine."}}
                    dic_transf = {
                        "content": before_text,
                        "meta": {"convo_id": idx, "answer": utt.text},
                    }
                    docs.append(dic_transf)
                    had_question = False
                # relies on a precomputed "questions" count in the utterance metadata
                if utt.meta["questions"] > 0:
                    had_question = True
                    before_text = utt.text
        self.document_store.write_documents(docs)
        self.retriever = BM25Retriever(document_store=self.document_store)
        self.pipe = DocumentSearchPipeline(retriever=self.retriever)

    def _sbert_embed_sim(self, embedding1, embedding2):
        """Cosine similarity between two SBERT embeddings."""
        return float(util.cos_sim(embedding1, embedding2))

    def _avg_bert_sim(self, texts):
        """Mean SBERT cosine similarity over all unordered pairs of texts (n*(n-1)/2 pairs)."""
        embeddings = [self.model.encode(text) for text in texts]

        scores = []
        for i, embedding1 in enumerate(embeddings):
            for j, embedding2 in enumerate(embeddings):
                # only score each unordered pair once
                if i >= j:
                    continue
                scores.append(self._sbert_embed_sim(embedding1, embedding2))
        return np.mean(scores)

    def _find_last_question(self, text):
        """Return the last question-mark-terminated sentence in the text ("" if there is none)."""
        end_sent = set([".", "?", "!"])
        last_q = text.rfind("?")
        for i in range(last_q - 1, -1, -1):
            if text[i] in end_sent:
                return text[i + 1 : last_q + 1].strip()
        return text[: last_q + 1].strip()
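
A minimal usage sketch for the similarity-based transformer. Unlike the BERT variant, it needs `fit()` before `transform()`, and `_load_questions` expects every utterance to carry a precomputed `questions` count in its metadata; the question detector below is a stand-in assumption, not part of this diff:

    from convokit import Corpus, download
    from convokit.open_question import OpennessScoreSimilarity  # hypothetical import path

    corpus = Corpus(filename=download("subreddit-Cornell"))  # example corpus
    for utt in corpus.iter_utterances():
        utt.add_meta("questions", utt.text.count("?"))  # stand-in question detector

    scorer = OpennessScoreSimilarity(obj_type="utterance")
    scorer.fit(corpus)  # builds the BM25 index of (question, answer) pairs
    corpus = scorer.transform(corpus)  # writes mean answer-similarity scores
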