Commit

Blacken notebooks
thatbudakguy committed Jul 17, 2023
1 parent 0a93873 commit 079d5fe
Showing 3 changed files with 96 additions and 76 deletions.
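This commit runs the Black formatter over the repository's notebooks: single-quoted strings are normalized to double quotes, and calls that exceed Black's default 88-character line length are wrapped. The exact command used isn't recorded in the commit, but Black formats .ipynb files directly when installed with its jupyter extra (pip install 'black[jupyter]', then black notebooks/). A minimal sketch of the same normalization via Black's Python API, applied to a plain source string rather than a notebook:

    import black

    # Two lines in the pre-commit style: single-quoted keys.
    src = (
        "matplotlib.rcParams['font.family'] = \"Heiti TC\"\n"
        "matplotlib.rcParams.update({'font.size': 14})\n"
    )

    # format_str applies Black's rules in memory: quotes are normalized
    # to double quotes and long calls are wrapped at 88 characters.
    print(black.format_str(src, mode=black.Mode()))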
138 changes: 78 additions & 60 deletions notebooks/annotations.ipynb

Large diffs are not rendered by default.

15 changes: 9 additions & 6 deletions notebooks/corpus_curation.ipynb
@@ -51,8 +51,8 @@
"from scripts.recipes.spancat import doc_spans_jdsw, span_rels_jdsw, SPAN_LABELS\n",
"\n",
"# set the default font for everything so chinese characters display correctly\n",
"matplotlib.rcParams['font.family'] = \"Heiti TC\"\n",
"matplotlib.rcParams.update({'font.size': 14})\n",
"matplotlib.rcParams[\"font.family\"] = \"Heiti TC\"\n",
"matplotlib.rcParams.update({\"font.size\": 14})\n",
"\n",
"# load the data\n",
"df = pd.read_csv(\"../assets/annotations.csv\")\n",
@@ -65,16 +65,19 @@
"\n",
"# limit to annotations more than 2 std and less than 10 deviations longer than the mean\n",
"limited_df = df[(df[\"length_std\"] > 2) & (df[\"length_std\"] < 10)]\n",
"print(f\"Found {len(limited_df)} annotations more than 2 and less than 10 std deviations from the mean.\")\n",
"print(\n",
" f\"Found {len(limited_df)} annotations more than 2 and less than 10 std deviations from the mean.\"\n",
")\n",
"\n",
"# plot the distribution of texts the annotations are from vs. the overall distribution\n",
"total_counts = df[\"title\"].value_counts().values\n",
"long_counts = limited_df[\"title\"].value_counts().values\n",
"index = df[\"title\"].value_counts().keys()\n",
"cmp_df = pd.DataFrame({\"total\": total_counts, \"long\": long_counts}, index=index)\n",
"cmp_df.plot.bar(figsize=(20, 10), rot=45, title=\"Distribution of Annotations by Text\", logy=True)\n",
"df = limited_df\n",
"\n"
"cmp_df.plot.bar(\n",
" figsize=(20, 10), rot=45, title=\"Distribution of Annotations by Text\", logy=True\n",
")\n",
"df = limited_df\n"
]
},
{
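The filter in the hunk above keeps annotations whose length is between 2 and 10 standard deviations above the mean. The derivation of the length_std column happens earlier in the notebook and isn't shown in this diff; a hypothetical sketch of how a z-score column like it could be computed:

    import pandas as pd

    df = pd.read_csv("../assets/annotations.csv")

    # Hypothetical: z-score of each annotation's length relative to the corpus.
    lengths = df["annotation"].str.len()
    df["length_std"] = (lengths - lengths.mean()) / lengths.std()

    # Keep only unusually long annotations (between 2 and 10 std above the mean).
    limited_df = df[(df["length_std"] > 2) & (df["length_std"] < 10)]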
19 changes: 9 additions & 10 deletions notebooks/ent_detect.ipynb
@@ -16,11 +16,11 @@
"import re\n",
"\n",
"# load the annotation data file\n",
"df = pd.read_csv('../assets/annotations.csv')\n",
"df = pd.read_csv(\"../assets/annotations.csv\")\n",
"\n",
"# set the default font for everything so chinese characters display correctly\n",
"matplotlib.rcParams['font.family'] = \"Heiti TC\"\n",
"matplotlib.rcParams.update({'font.size': 14})"
"matplotlib.rcParams[\"font.family\"] = \"Heiti TC\"\n",
"matplotlib.rcParams.update({\"font.size\": 14})\n"
]
},
{
@@ -30,7 +30,7 @@
"source": [
"# Entity Detection\n",
"\n",
"After preprocessing using heuristics, what are the most common spans that occur just before the markers \"\" and \"\"?"
"After preprocessing using heuristics, what are the most common spans that occur just before the markers \"\" and \"\"?\n"
]
},
{
@@ -931,17 +931,17 @@
"\n",
"ents = Counter()\n",
"\n",
"for annotation in df['annotation']:\n",
"for annotation in df[\"annotation\"]:\n",
" spans = doc_to_spans(annotation)\n",
" for i, span in enumerate(spans):\n",
" if \"\" in span.text or span.label == \"GRAPHIC\":\n",
" if not spans[i - 1].label:\n",
" print(annotation)\n",
" ents[spans[i-1].text] += 1\n",
" ents[spans[i - 1].text] += 1\n",
"\n",
"patterns = defaultdict(list)\n",
"for pattern in srsly.read_jsonl('../assets/ner_patterns.jsonl'):\n",
" patterns[pattern['label']].append(pattern['pattern'])\n",
"for pattern in srsly.read_jsonl(\"../assets/ner_patterns.jsonl\"):\n",
" patterns[pattern[\"label\"]].append(pattern[\"pattern\"])\n",
"\n",
"nlp = spacy.blank(\"zh\")\n",
"matcher = PhraseMatcher(nlp.vocab)\n",
@@ -956,8 +956,7 @@
" hits += 1\n",
"\n",
"print(len(ents))\n",
"print(ents.most_common(20))\n",
"\n"
"print(ents.most_common(20))\n"
]
}
],
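The code elided between the matcher construction and the hit counting isn't shown in this diff; a hedged sketch of the PhraseMatcher setup it implies, where each label's pattern strings become pattern Docs (the ents Counter is the one built in the cell above):

    import srsly
    import spacy
    from collections import defaultdict
    from spacy.matcher import PhraseMatcher

    # Rebuild the label -> pattern strings mapping from the cell above.
    patterns = defaultdict(list)
    for pattern in srsly.read_jsonl("../assets/ner_patterns.jsonl"):
        patterns[pattern["label"]].append(pattern["pattern"])

    nlp = spacy.blank("zh")
    matcher = PhraseMatcher(nlp.vocab)

    # Assumed: register each label's pattern strings as pattern Docs.
    for label, strings in patterns.items():
        matcher.add(label, [nlp.make_doc(s) for s in strings])

    # Assumed: count candidate entity texts that match at least one pattern.
    hits = 0
    for text in ents:
        if matcher(nlp.make_doc(text)):
            hits += 1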
