From 49b00009de3bed5e1707fca8ffa107fa4c117d0d Mon Sep 17 00:00:00 2001
From: Forest Gregg <fgregg@datamade.us>
Date: Fri, 25 Oct 2024 12:49:05 -0400
Subject: [PATCH] Add type annotations; reformat docs/conf.py

---
 docs/conf.py               | 158 ++++++++++++++++++++-----------------
 probablepeople/__init__.py |  37 +++++----
 2 files changed, 104 insertions(+), 91 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 0b6bedc..23f5a8e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #
 # probablepeople documentation build configuration file, created by
 # sphinx-quickstart on Mon Mar 16 21:43:12 2015.
@@ -11,19 +10,16 @@
 #
 # All configuration values have a default; values that are commented out
 # serve to show the default.
-
-import sys
-import os
-
+#
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('.'))
 
 # -- General configuration ------------------------------------------------
 
 # If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -31,194 +27,197 @@
 extensions = []
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = u'probablepeople'
-copyright = u'2015, Cathy Deng, Forest Gregg'
+project = "probablepeople"
+copyright = "2015, Cathy Deng, Forest Gregg"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = '0.3.1'
+version = "0.3.1"
 # The full version, including alpha/beta/rc tags.
-release = '0.3.1'
+release = "0.3.1"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
-#language = None
+# language = None
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
-#today = ''
+# today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
-#default_role = None
+# default_role = None
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
 
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
-#show_authors = False
+# show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
 
 # If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
 
 
 # -- Options for HTML output ----------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'default'
+html_theme = "default"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+# html_theme_options = {}
 
 # Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
 
 # The name for this set of Sphinx documents.  If None, it defaults to
 # "<project> v<release> documentation".
-#html_title = None
+# html_title = None
 
 # A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
 
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = None
 
 # The name of an image file (within the static path) to use as favicon of the
 # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+# html_favicon = None
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
 # directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
 
 # If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
 
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
 
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
 
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it.  The value of this option must be the
 # base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
 
 # This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'probablepeopledoc'
+htmlhelp_basename = "probablepeopledoc"
 
 
 # -- Options for LaTeX output ---------------------------------------------
 
 latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+    # The paper size ('letterpaper' or 'a4paper').
+    # 'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    # 'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    # 'preamble': '',
 }
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  ('index', 'probablepeople.tex', u'probablepeople Documentation',
-   u'Cathy Deng, Forest Gregg', 'manual'),
+    (
+        "index",
+        "probablepeople.tex",
+        "probablepeople Documentation",
+        "Cathy Deng, Forest Gregg",
+        "manual",
+    ),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
-#latex_logo = None
+# latex_logo = None
 
 # For "manual" documents, if this is true, then toplevel headings are parts,
 # not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
 
 # If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
 
 # If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
 
 # Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
 
 # If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
 
 
 # -- Options for manual page output ---------------------------------------
@@ -226,12 +225,17 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'probablepeople', u'probablepeople Documentation',
-     [u'Cathy Deng, Forest Gregg'], 1)
+    (
+        "index",
+        "probablepeople",
+        "probablepeople Documentation",
+        ["Cathy Deng, Forest Gregg"],
+        1,
+    )
 ]
 
 # If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
 
 
 # -- Options for Texinfo output -------------------------------------------
@@ -240,19 +244,25 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  ('index', 'probablepeople', u'probablepeople Documentation',
-   u'Cathy Deng, Forest Gregg', 'probablepeople', 'One line description of project.',
-   'Miscellaneous'),
+    (
+        "index",
+        "probablepeople",
+        "probablepeople Documentation",
+        "Cathy Deng, Forest Gregg",
+        "probablepeople",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
 
 # Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
 
 # If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
 
 # How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
 
 # If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
diff --git a/probablepeople/__init__.py b/probablepeople/__init__.py
index 78d8418..f5acd4b 100644
--- a/probablepeople/__init__.py
+++ b/probablepeople/__init__.py
@@ -3,8 +3,8 @@
 import os
 import re
 import string
+import typing
 import warnings
-from collections import OrderedDict
 
 import probableparsing
 import pycrfsuite
@@ -13,6 +13,8 @@
 from .gender import gender_names
 from .ratios import ratios
 
+Feature = dict[str, typing.Union[str, bool, "Feature"]]
+
 LABELS = [
     "PrefixMarital",
     "PrefixOther",
@@ -53,7 +55,7 @@
 PREPOSITIONS = {"for", "to", "of", "on"}
 
 
-def _loadTagger(model_type):
+def _loadTagger(model_type: str) -> pycrfsuite.Tagger:
     tagger = pycrfsuite.Tagger()
     try:
         tagger.open(
@@ -74,7 +76,7 @@ def _loadTagger(model_type):
 TAGGER = _loadTagger("generic")
 
 
-def parse(raw_string, type=None):
+def parse(raw_string: str, type: str | None = None) -> list[tuple[str, str]]:
     if type is None:
         type = "generic"
     tagger = TAGGERS[type]
@@ -97,8 +99,8 @@ def parse(raw_string, type=None):
     return list(zip(tokens, tags))
 
 
-def tag(raw_string, type=None):
-    tagged = OrderedDict()
+def tag(raw_string: str, type: str | None = None) -> tuple[dict[str, str], str]:
+    tagged = {}
 
     prev_label = None
     and_label = False
@@ -144,22 +146,23 @@ def tag(raw_string, type=None):
 
         prev_label = label
 
+    tagged_name = {}
     for label in tagged:
         component = " ".join(tagged[label])
         component = component.strip(" ,;")
-        tagged[label] = component
+        tagged_name[label] = component
 
-    if "CorporationName" in tagged or "ShortForm" in tagged:
+    if "CorporationName" in tagged_name or "ShortForm" in tagged_name:
         name_type = "Corporation"
     elif and_label:
         name_type = "Household"
     else:
         name_type = "Person"
 
-    return tagged, name_type
+    return tagged_name, name_type
 
 
-def tokenize(raw_string):
+def tokenize(raw_string: str) -> list[str]:
 
     if isinstance(raw_string, bytes):
         raw_string = raw_string.decode()
@@ -183,7 +186,7 @@ def tokenize(raw_string):
     return tokens
 
 
-def tokens2features(tokens):
+def tokens2features(tokens) -> list[Feature]:
 
     feature_sequence = [tokenFeatures(tokens[0])]
     previous_features = feature_sequence[-1].copy()
@@ -209,8 +212,8 @@ def tokens2features(tokens):
     if len(feature_sequence) > 1:
         feature_sequence[0]["rawstring.start"] = True
         feature_sequence[-1]["rawstring.end"] = True
-        feature_sequence[1]["previous"]["rawstring.start"] = True
-        feature_sequence[-2]["next"]["rawstring.end"] = True
+        feature_sequence[1]["previous"]["rawstring.start"] = True  # type: ignore [index]
+        feature_sequence[-2]["next"]["rawstring.end"] = True  # type: ignore [index]
 
     else:
         feature_sequence[0]["singleton"] = True
@@ -218,7 +221,7 @@ def tokens2features(tokens):
     return feature_sequence
 
 
-def tokenFeatures(token):
+def tokenFeatures(token: str) -> Feature:
 
     if token in ("&"):
         token_clean = token_abbrev = token
@@ -273,16 +276,16 @@ def tokenFeatures(token):
     return features
 
 
-def vowelRatio(token):
+def vowelRatio(token: str) -> int | bool:
     n_chars = len(token)
     if n_chars > 1:
         n_vowels = sum(token.count(c) for c in VOWELS_Y)
-        return n_vowels // float(n_chars)
+        return int(n_vowels // float(n_chars))  # NOTE: // floors to a whole number
     else:
         return False
 
 
-def digits(token):
+def digits(token: str) -> typing.Literal["all_digits", "some_digits", "no_digits"]:
     if token.isdigit():
         return "all_digits"
     elif set(token) & set(string.digits):
@@ -291,7 +294,7 @@ def digits(token):
         return "no_digits"
 
 
-def ngrams(word, n=2):
+def ngrams(word: str, n: int = 2) -> typing.Generator[str, None, None]:
     return ("".join(letters) for letters in zip(*[word[i:] for i in range(n)]))