Skip to content

Commit

Permalink
pre-commit run --all
Browse files Browse the repository at this point in the history
  • Loading branch information
soxofaan committed Jul 6, 2024
1 parent 545b300 commit 8cd37f8
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 39 deletions.
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
4 changes: 2 additions & 2 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from dahuffman.codecs import load, load_shakespeare, load_shakespeare_lower, load_xml

LOREM_IPSUM = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Dolor sed viverra ipsum nunc aliquet bibendum enim. In massa tempor
nec feugiat. Nunc aliquet bibendum enim facilisis gravida.
"""
Expand Down
22 changes: 13 additions & 9 deletions train/json-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS
from train.train_utils import CODECS, download

_log = logging.getLogger()

Expand Down Expand Up @@ -34,30 +34,34 @@ def main():
"https://data.mo.gov/api/views/vpge-tj3s/rows.json",
]

_log.info('Building frequency tables')
_log.info("Building frequency tables")
frequencies_raw = Counter()
frequencies_compact = Counter()
for url in urls:
path = download(url, 'json-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json')
with path.open('r') as f:
path = download(
url, "json-data/" + hashlib.md5(url.encode("utf-8")).hexdigest() + ".json"
)
with path.open("r") as f:
raw = f.read()
# Only take first N bytes.
# Large files probably have a lot of structural repetition, which skews the frequencies
frequencies_raw.update(raw[:100000])

# Parse and re-encode to compact JSON
compact = json.dumps(json.loads(raw), separators=(',', ':'))
compact = json.dumps(json.loads(raw), separators=(",", ":"))
frequencies_compact.update(compact[:100000])

# TODO add more metadata
_log.info(f'Frequencies raw {len(frequencies_raw)}: {frequencies_raw}')
_log.info(f"Frequencies raw {len(frequencies_raw)}: {frequencies_raw}")
codec = HuffmanCodec.from_frequencies(frequencies_raw)
codec.save(CODECS / "json.pickle", metadata={"frequencies": frequencies_raw})

_log.info(f'Frequencies compact {len(frequencies_compact)}: {frequencies_compact}')
_log.info(f"Frequencies compact {len(frequencies_compact)}: {frequencies_compact}")
codec = HuffmanCodec.from_frequencies(frequencies_compact)
codec.save(CODECS / "json-compact.pickle", metadata={"frequencies": frequencies_compact})
codec.save(
CODECS / "json-compact.pickle", metadata={"frequencies": frequencies_compact}
)


if __name__ == '__main__':
if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion train/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
requests
31 changes: 17 additions & 14 deletions train/shakespeare.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,50 @@
Build codecs based on Shakespeare' work
"""

import logging
import re
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS
from train.train_utils import CODECS, download

_log = logging.getLogger()


def main():
    """Build Huffman codecs trained on Shakespeare's complete works.

    Downloads the Project Gutenberg corpus and saves three codecs under
    CODECS: one for the raw text, one for whitespace-normalized text,
    and one for the lower-cased normalized text.
    """
    logging.basicConfig(level=logging.INFO)
    # Shakespeare Complete Work from Project Gutenberg
    url = "http://www.gutenberg.org/files/100/100-0.txt"
    path = download(url, "shakespeare.txt")

    # "utf-8-sig" strips the BOM that the Gutenberg text file starts with.
    with path.open("r", encoding="utf-8-sig") as f:
        raw = f.read()

    _log.info("Building codec from raw data")
    frequencies = Counter(raw)
    _log.info(f"Frequencies {len(frequencies)}: {frequencies}")
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare-raw.pickle", metadata={"frequencies": frequencies})

    _log.info("Doing white space clean up")
    # Collapse newline runs (and surrounding spaces) to a single newline,
    # then collapse space runs to a single space.
    clean = raw
    clean = re.sub(r"\s*\n+\s*", "\n", clean)
    clean = re.sub(r" +", " ", clean)
    frequencies = Counter(clean)
    _log.info(f"Frequencies {len(frequencies)}: {frequencies}")
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare.pickle", metadata={"frequencies": frequencies})

    _log.info("Only handling lower case")
    lower = clean.lower()
    frequencies = Counter(lower)
    _log.info(f"Frequencies {len(frequencies)}: {frequencies}")
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(
        CODECS / "shakespeare-lower.pickle", metadata={"frequencies": frequencies}
    )


if __name__ == "__main__":
    main()
12 changes: 6 additions & 6 deletions train/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from dahuffman.huffmancodec import ensure_dir

# Local cache directory for downloaded training corpora.
DOWNLOADS = Path(__file__).parent / "data"
# Output directory where trained codec pickles are stored.
CODECS = Path(__file__).parent / "codecs"

_log = logging.getLogger()

def download(url: str, path: str) -> Path:
    """Download `url` to DOWNLOADS/`path`, skipping if the file already exists.

    :param url: URL to fetch.
    :param path: destination path, relative to the DOWNLOADS directory.
    :return: local Path of the (possibly cached) file.
    :raises requests.HTTPError: if the server responds with an error status.
    """
    path = DOWNLOADS / path
    if not path.exists():
        ensure_dir(path.parent)
        _log.info(f"Downloading {url}")
        with requests.get(url) as r:
            r.raise_for_status()
            with path.open("wb") as f:
                # Stream to disk in chunks to avoid holding the whole body in memory.
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)
        _log.info(f"Downloaded to {path}")
    else:
        _log.info(f"{path} already exists. (Not downloading from {url} again)")
    return path
14 changes: 8 additions & 6 deletions train/xml-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS
from train.train_utils import CODECS, download

_log = logging.getLogger()

Expand Down Expand Up @@ -34,21 +34,23 @@ def main():
"https://data.mo.gov/api/views/vpge-tj3s/rows.xml",
]

_log.info('Building frequency tables')
_log.info("Building frequency tables")
frequencies = Counter()
for url in urls:
path = download(url, 'xml-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.xml')
with path.open('r') as f:
path = download(
url, "xml-data/" + hashlib.md5(url.encode("utf-8")).hexdigest() + ".xml"
)
with path.open("r") as f:
# Only take first N bytes.
# Large files probably have a lot of structural repetition, which skews the frequencies
raw = f.read(100000)
frequencies.update(raw)

# TODO add more metadata
_log.info(f'Frequencies raw {len(frequencies)}: {frequencies}')
_log.info(f"Frequencies raw {len(frequencies)}: {frequencies}")
codec = HuffmanCodec.from_frequencies(frequencies)
codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies})


if __name__ == '__main__':
if __name__ == "__main__":
main()

0 comments on commit 8cd37f8

Please sign in to comment.