Merge branch 'CornellNLP:master' into master

CornellNLP · May 16, 2024 · 435fdbb · 435fdbb
2 parents ac8f0b9 + ee33690
commit 435fdbb
Show file tree

Hide file tree

Showing 14 changed files with 24 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -57,10 +57,10 @@ Available as an interactive notebook: [full version (fine-tuning + inference)](h
 ConvoKit ships with several datasets ready for use "out-of-the-box".
 These datasets can be downloaded using the `convokit.download()` [helper function](https://github.com/CornellNLP/ConvoKit/blob/master/convokit/util.py).  Alternatively you can access them directly [here](http://zissou.infosci.cornell.edu/convokit/datasets/).
 
-### [Conversations Gone Awry Dataset](https://convokit.cornell.edu/documentation/awry.html)
+### [Conversations Gone Awry Datasets](https://convokit.cornell.edu/documentation/awry.html)
 
-Two related corpora of conversations that derail into antisocial behavior. One corpus consists of Wikipedia talk page conversations that derail into personal attacks as labeled by crowdworkers (4,188 conversations containing 30.021 comments). The other consists of discussion threads on the subreddit ChangeMyView (CMV) that derail into rule-violating behavior as determined by the presence of a moderator intervention (6,842 conversations containing 42,964 comments).
-Name for download: `conversations-gone-awry-corpus` (Wikipedia version) or `conversations-gone-awry-cmv-corpus` (Reddit CMV version)
+Two related corpora of conversations that derail into antisocial behavior. One corpus (CGA-WIKI) consists of Wikipedia talk page conversations that derail into personal attacks as labeled by crowdworkers (4,188 conversations containing 30,021 comments). The other (CGA-CMV) consists of discussion threads on the subreddit ChangeMyView (CMV) that derail into rule-violating behavior as determined by the presence of a moderator intervention (6,842 conversations containing 42,964 comments).
+Name for download: `conversations-gone-awry-corpus` (for CGA-WIKI) or `conversations-gone-awry-cmv-corpus` (for CGA-CMV)
 
 ### [Cornell Movie-Dialogs Corpus](https://convokit.cornell.edu/documentation/movie.html)
 

diff --git a/convokit/expected_context_framework/col_normed_tfidf.py b/convokit/expected_context_framework/col_normed_tfidf.py
@@ -115,7 +115,6 @@ def dump(self, dirname):
 
 
 class ColNormedTfidf(TransformerMixin):
-
     """
     Model that derives tf-idf reweighted representations of utterances,
     which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfTransformer` transformer; see documentation of that transformer for further details.

diff --git a/convokit/hyperconvo/hyperconvo.py b/convokit/hyperconvo/hyperconvo.py
@@ -18,17 +18,17 @@ def degree_stat_funcs(nan_val):
         "norm.max": lambda l: np.max(l) / np.sum(l) if np.sum(l) > 0 else 0,
         "2nd-largest": lambda l: int(np.partition(l, -2)[-2]) if len(l) > 1 else nan_val,
         "2nd-argmax": lambda l: int((-l).argsort()[1]) if len(l) > 1 else nan_val,
-        "norm.2nd-largest": lambda l: np.partition(l, -2)[-2] / np.sum(l)
-        if (len(l) > 1 and np.sum(l) > 0)
-        else nan_val,
+        "norm.2nd-largest": lambda l: (
+            np.partition(l, -2)[-2] / np.sum(l) if (len(l) > 1 and np.sum(l) > 0) else nan_val
+        ),
         "mean": np.mean,
         "mean-nonzero": lambda l: np.mean(l[l != 0]) if len(l[l != 0]) > 0 else 0,
         "prop-nonzero": lambda l: np.mean(l != 0),
         "prop-multiple": lambda l: np.mean(l[l != 0] > 1) if len(l[l != 0] > 1) > 0 else 0,
         "entropy": lambda l: scipy.stats.entropy(l) if np.sum(l) > 0 else nan_val,
-        "2nd-largest / max": lambda l: np.partition(l, -2)[-2] / np.max(l)
-        if (len(l) > 1 and np.sum(l) > 0)
-        else nan_val,
+        "2nd-largest / max": lambda l: (
+            np.partition(l, -2)[-2] / np.max(l) if (len(l) > 1 and np.sum(l) > 0) else nan_val
+        ),
     }
 
 

diff --git a/convokit/model/corpus_helpers.py b/convokit/model/corpus_helpers.py
@@ -577,9 +577,11 @@ def dump_utterances(corpus, dir_name, exclude_vectors, fields_to_skip):
                 KeyMeta: dump_helper_bin(ut.meta, d_bin, fields_to_skip.get("utterance", [])),
                 KeyReplyTo: ut.reply_to,
                 KeyTimestamp: ut.timestamp,
-                KeyVectors: ut.vectors
-                if exclude_vectors is None
-                else list(set(ut.vectors) - set(exclude_vectors)),
+                KeyVectors: (
+                    ut.vectors
+                    if exclude_vectors is None
+                    else list(set(ut.vectors) - set(exclude_vectors))
+                ),
             }
             json.dump(ut_obj, f)
             f.write("\n")

diff --git a/convokit/politeness_collections/politeness_api/features/vectorizer.py b/convokit/politeness_collections/politeness_api/features/vectorizer.py
@@ -37,7 +37,6 @@ def get_unigrams_and_bigrams(document):
 
 
 class PolitenessFeatureVectorizer:
-
     """
     Returns document features based on-
         - unigrams and bigrams

diff --git a/convokit/speakerConvoDiversity/speakerConvoDiversity.py b/convokit/speakerConvoDiversity/speakerConvoDiversity.py
@@ -215,7 +215,6 @@ def compute_speaker_convo_divergence(
 
 
 class SpeakerConvoDiversityWrapper(Transformer):
-
     """
     Implements methodology for calculating linguistic diversity per life-stage. A wrapper around `SpeakerConvoDiversity`.
 

diff --git a/convokit/speakerConvoDiversity/speakerConvoDiversity2.py b/convokit/speakerConvoDiversity/speakerConvoDiversity2.py
@@ -208,7 +208,6 @@ def _set_output(self, corpus, df):
 
 
 class SpeakerConvoDiversityWrapper(Transformer):
-
     """
     Implements methodology for calculating linguistic diversity per life-stage. A wrapper around `SpeakerConvoDiversity`.
 

diff --git a/convokit/speaker_convo_helpers/speaker_convo_attrs.py b/convokit/speaker_convo_helpers/speaker_convo_attrs.py
@@ -3,7 +3,6 @@
 
 
 class SpeakerConvoAttrs(Transformer):
-
     """
     Transformer that aggregates statistics per (speaker, convo). e.g., average wordcount of all utterances that speaker contributed per convo. Assumes that `corpus.organize_speaker_convo_history` has already been called.
 

diff --git a/convokit/speaker_convo_helpers/speaker_convo_lifestage.py b/convokit/speaker_convo_helpers/speaker_convo_lifestage.py
@@ -2,7 +2,6 @@
 
 
 class SpeakerConvoLifestage(Transformer):
-
     """
     Transformer that, for each speaker in a conversation, computes the lifestage of the speaker in that conversation. For instance, if lifestages are 20 conversations long, then the first 20 conversations a speaker participates in will be in lifestage 0, and the second 20 will be in lifestage 1.
 

diff --git a/docs/source/awry.rst b/docs/source/awry.rst
@@ -1,5 +1,5 @@
-Conversations Gone Awry Dataset
-===============================
+Conversations Gone Awry Dataset - Wikipedia version (CGA-WIKI) 
+==============================================================
 
 A collection of conversations from Wikipedia talk pages that derail into personal attacks (4,188 conversations, 30,021 comments).
 

diff --git a/docs/source/awry_cmv.rst b/docs/source/awry_cmv.rst
@@ -1,5 +1,5 @@
-Conversations Gone Awry Dataset [Reddit CMV version]
-====================================================
+Conversations Gone Awry Dataset - Reddit CMV version (CGA-CMV)
+==============================================================
 
 A collection of conversations from the ChangeMyView (CMV) subreddit that derail into personal attacks (6,842 conversations, 42,964 comments). 
 

diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
@@ -2,8 +2,8 @@ Datasets
 ========
 
 .. toctree::
-   Conversations Gone Awry Dataset (Wikipedia version) <awry.rst>
-   Conversations Gone Awry Dataset (Reddit CMV version) <awry_cmv.rst>
+   Conversations Gone Awry Dataset - Wikipedia version (CGA-WIKI) <awry.rst>
+   Conversations Gone Awry Dataset - Reddit CMV version (CGA-CMV) <awry_cmv.rst>
    Cornell Movie-Dialogs Corpus <movie.rst>
    CANDOR Corpus <candor.rst>
    Parliament Question Time Corpus <parliament.rst>

diff --git a/docs/source/wiki.rst b/docs/source/wiki.rst
@@ -66,10 +66,10 @@ Related links
 Data License
 ^^^^^^^^^^^^
 
-This dataset is governed by the `CC BY license v4.0 <https://creativecommons.org/licenses/by/4.0/>`_. Copyright (C) 2017-2020 The ConvoKit Developers.
+This dataset is governed by the `CC BY-SA license v4.0 <https://creativecommons.org/licenses/by-sa/4.0/>`_.
 
 
 Contact
 ^^^^^^^
 
-Please email any questions to: [email protected] (Cristian Danescu-Niculescu-Mizil)
+Please email any questions to: [email protected] (Cristian Danescu-Niculescu-Mizil)
diff --git a/examples/Introduction_to_ConvoKit.ipynb b/examples/Introduction_to_ConvoKit.ipynb
@@ -539,7 +539,7 @@
     "\n",
     "**Metadata** is where you *customize* the Corpus to your use case.\n",
     "\n",
-    "When working with your own metadata, you may we want to add elements specific to your own dataset. The metadata of each object is a dict-like structure that can be customized according to your needs. As such, you may do things like:\n",
+    "When working with your own metadata, you may want to add elements specific to your own dataset. The metadata of each object is a dict-like structure that can be customized according to your needs. As such, you may do things like:\n",
     "\n",
     "- Store the dependency parse of an Utterance\n",
     "- Label a component according to your own categories\n",
@@ -3015,4 +3015,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}