From 49b00009de3bed5e1707fca8ffa107fa4c117d0d Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Fri, 25 Oct 2024 12:49:05 -0400 Subject: [PATCH] typing --- docs/conf.py | 158 ++++++++++++++++++++----------------- probablepeople/__init__.py | 37 +++++---- 2 files changed, 104 insertions(+), 91 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 0b6bedc..23f5a8e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # probablepeople documentation build configuration file, created by # sphinx-quickstart on Mon Mar 16 21:43:12 2015. @@ -11,19 +10,16 @@ # # All configuration values have a default; values that are commented out # serve to show the default. - -import sys -import os - +# # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -31,194 +27,197 @@ extensions = [] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. 
-project = u'probablepeople' -copyright = u'2015, Cathy Deng, Forest Gregg' +project = "probablepeople" +copyright = "2015, Cathy Deng, Forest Gregg" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.3.1' +version = "0.3.1" # The full version, including alpha/beta/rc tags. -release = '0.3.1' +release = "0.3.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. 
-#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = "default" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # "<project> v<release> documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. 
-#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a <link> tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'probablepeopledoc' +htmlhelp_basename = "probablepeopledoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. 
-#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'probablepeople.tex', u'probablepeople Documentation', - u'Cathy Deng, Forest Gregg', 'manual'), + ( + "index", + "probablepeople.tex", + "probablepeople Documentation", + "Cathy Deng, Forest Gregg", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -226,12 +225,17 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'probablepeople', u'probablepeople Documentation', - [u'Cathy Deng, Forest Gregg'], 1) + ( + "index", + "probablepeople", + "probablepeople Documentation", + ["Cathy Deng, Forest Gregg"], + 1, + ) ] # If true, show URL addresses after external links. 
-#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -240,19 +244,25 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'probablepeople', u'probablepeople Documentation', - u'Cathy Deng, Forest Gregg', 'probablepeople', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "probablepeople", + "probablepeople Documentation", + "Cathy Deng, Forest Gregg", + "probablepeople", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. 
-#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/probablepeople/__init__.py b/probablepeople/__init__.py index 78d8418..f5acd4b 100644 --- a/probablepeople/__init__.py +++ b/probablepeople/__init__.py @@ -3,8 +3,8 @@ import os import re import string +import typing import warnings -from collections import OrderedDict import probableparsing import pycrfsuite @@ -13,6 +13,8 @@ from .gender import gender_names from .ratios import ratios +Feature = dict[str, typing.Union[str, bool, "Feature"]] + LABELS = [ "PrefixMarital", "PrefixOther", @@ -53,7 +55,7 @@ PREPOSITIONS = {"for", "to", "of", "on"} -def _loadTagger(model_type): +def _loadTagger(model_type: str) -> pycrfsuite.Tagger: tagger = pycrfsuite.Tagger() try: tagger.open( @@ -74,7 +76,7 @@ def _loadTagger(model_type): TAGGER = _loadTagger("generic") -def parse(raw_string, type=None): +def parse(raw_string: str, type: str | None = None) -> list[tuple[str, str]]: if type is None: type = "generic" tagger = TAGGERS[type] @@ -97,8 +99,8 @@ def parse(raw_string, type=None): return list(zip(tokens, tags)) -def tag(raw_string, type=None): - tagged = OrderedDict() +def tag(raw_string: str, type: str | None = None) -> tuple[dict[str, str], str]: + tagged = {} prev_label = None and_label = False @@ -144,22 +146,23 @@ def tag(raw_string, type=None): prev_label = label + tagged_name = {} for label in tagged: component = " ".join(tagged[label]) component = component.strip(" ,;") - tagged[label] = component + tagged_name[label] = component - if "CorporationName" in tagged or "ShortForm" in tagged: + if "CorporationName" in tagged_name or "ShortForm" in tagged_name: name_type = "Corporation" elif and_label: name_type = "Household" else: name_type = "Person" - return tagged, name_type + return tagged_name, name_type -def tokenize(raw_string): +def tokenize(raw_string: str) -> list[str]: if isinstance(raw_string, bytes): raw_string = raw_string.decode() @@ -183,7 +186,7 @@ def tokenize(raw_string): 
return tokens -def tokens2features(tokens): +def tokens2features(tokens) -> list[Feature]: feature_sequence = [tokenFeatures(tokens[0])] previous_features = feature_sequence[-1].copy() @@ -209,8 +212,8 @@ def tokens2features(tokens): if len(feature_sequence) > 1: feature_sequence[0]["rawstring.start"] = True feature_sequence[-1]["rawstring.end"] = True - feature_sequence[1]["previous"]["rawstring.start"] = True - feature_sequence[-2]["next"]["rawstring.end"] = True + feature_sequence[1]["previous"]["rawstring.start"] = True # type: ignore [index] + feature_sequence[-2]["next"]["rawstring.end"] = True # type: ignore [index] else: feature_sequence[0]["singleton"] = True @@ -218,7 +221,7 @@ def tokens2features(tokens): return feature_sequence -def tokenFeatures(token): +def tokenFeatures(token: str) -> Feature: if token in ("&"): token_clean = token_abbrev = token @@ -273,16 +276,16 @@ def tokenFeatures(token): return features -def vowelRatio(token): +def vowelRatio(token: str) -> int | bool: n_chars = len(token) if n_chars > 1: n_vowels = sum(token.count(c) for c in VOWELS_Y) - return n_vowels // float(n_chars) + return int(n_vowels // float(n_chars)) else: return False -def digits(token): +def digits(token: str) -> typing.Literal["all_digits", "some_digits", "no_digits"]: if token.isdigit(): return "all_digits" elif set(token) & set(string.digits): @@ -291,7 +294,7 @@ def digits(token): return "no_digits" -def ngrams(word, n=2): +def ngrams(word: str, n: int = 2) -> typing.Generator[str]: return ("".join(letters) for letters in zip(*[word[i:] for i in range(n)]))