Skip to content

Commit

Permalink
pre-commit run --all
Browse files Browse the repository at this point in the history
  • Loading branch information
soxofaan committed Jul 6, 2024
1 parent 545b300 commit 8cd37f8
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 39 deletions.
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
4 changes: 2 additions & 2 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from dahuffman.codecs import load, load_shakespeare, load_shakespeare_lower, load_xml

LOREM_IPSUM = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Dolor sed viverra ipsum nunc aliquet bibendum enim. In massa tempor
nec feugiat. Nunc aliquet bibendum enim facilisis gravida.
"""
Expand Down
22 changes: 13 additions & 9 deletions train/json-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS
from train.train_utils import CODECS, download

_log = logging.getLogger()

Expand Down Expand Up @@ -34,30 +34,34 @@ def main():
"https://data.mo.gov/api/views/vpge-tj3s/rows.json",
]

_log.info('Building frequency tables')
_log.info("Building frequency tables")
frequencies_raw = Counter()
frequencies_compact = Counter()
for url in urls:
path = download(url, 'json-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json')
with path.open('r') as f:
path = download(
url, "json-data/" + hashlib.md5(url.encode("utf-8")).hexdigest() + ".json"
)
with path.open("r") as f:
raw = f.read()
# Only take first N bytes.
# Large files probably have a lot of structural repetition, which skews the frequencies
frequencies_raw.update(raw[:100000])

# Parse and re-encode to compact JSON
compact = json.dumps(json.loads(raw), separators=(',', ':'))
compact = json.dumps(json.loads(raw), separators=(",", ":"))
frequencies_compact.update(compact[:100000])

# TODO add more metadata
_log.info(f'Frequencies raw {len(frequencies_raw)}: {frequencies_raw}')
_log.info(f"Frequencies raw {len(frequencies_raw)}: {frequencies_raw}")
codec = HuffmanCodec.from_frequencies(frequencies_raw)
codec.save(CODECS / "json.pickle", metadata={"frequencies": frequencies_raw})

_log.info(f'Frequencies compact {len(frequencies_compact)}: {frequencies_compact}')
_log.info(f"Frequencies compact {len(frequencies_compact)}: {frequencies_compact}")
codec = HuffmanCodec.from_frequencies(frequencies_compact)
codec.save(CODECS / "json-compact.pickle", metadata={"frequencies": frequencies_compact})
codec.save(
CODECS / "json-compact.pickle", metadata={"frequencies": frequencies_compact}
)


if __name__ == '__main__':
if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion train/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
requests
31 changes: 17 additions & 14 deletions train/shakespeare.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,50 @@
Build codecs based on Shakespeare' work
"""

import logging
import re
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS
from train.train_utils import CODECS, download

_log = logging.getLogger()


def main():
    """Build Huffman codecs trained on Shakespeare's complete works.

    Downloads the Project Gutenberg corpus and saves three codecs under
    CODECS: one for the raw text, one for whitespace-normalized text,
    and one for the lower-cased normalized text.
    """
    logging.basicConfig(level=logging.INFO)
    # Shakespeare Complete Work from Project Gutenberg
    url = "http://www.gutenberg.org/files/100/100-0.txt"
    path = download(url, "shakespeare.txt")

    # "utf-8-sig" strips the BOM that the Gutenberg text file starts with.
    with path.open("r", encoding="utf-8-sig") as f:
        raw = f.read()

    _log.info("Building codec from raw data")
    frequencies = Counter(raw)
    _log.info(f"Frequencies {len(frequencies)}: {frequencies}")
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare-raw.pickle", metadata={"frequencies": frequencies})

    _log.info("Doing white space clean up")
    # Collapse newline runs (and surrounding spaces) to a single newline,
    # then collapse space runs to a single space.
    clean = raw
    clean = re.sub(r"\s*\n+\s*", "\n", clean)
    clean = re.sub(r" +", " ", clean)
    frequencies = Counter(clean)
    _log.info(f"Frequencies {len(frequencies)}: {frequencies}")
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(CODECS / "shakespeare.pickle", metadata={"frequencies": frequencies})

    _log.info("Only handling lower case")
    lower = clean.lower()
    frequencies = Counter(lower)
    _log.info(f"Frequencies {len(frequencies)}: {frequencies}")
    codec = HuffmanCodec.from_frequencies(frequencies)
    codec.save(
        CODECS / "shakespeare-lower.pickle", metadata={"frequencies": frequencies}
    )


if __name__ == "__main__":
    main()
12 changes: 6 additions & 6 deletions train/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from dahuffman.huffmancodec import ensure_dir

# Local cache directory for downloaded training corpora.
DOWNLOADS = Path(__file__).parent / "data"
# Output directory where trained codec pickles are stored.
CODECS = Path(__file__).parent / "codecs"

_log = logging.getLogger()

def download(url: str, path: str) -> Path:
    """Download `url` to DOWNLOADS/`path`, skipping if the file already exists.

    :param url: URL to fetch.
    :param path: destination path, relative to the DOWNLOADS directory.
    :return: local Path of the (possibly cached) file.
    :raises requests.HTTPError: if the server responds with an error status.
    """
    path = DOWNLOADS / path
    if not path.exists():
        ensure_dir(path.parent)
        _log.info(f"Downloading {url}")
        with requests.get(url) as r:
            r.raise_for_status()
            with path.open("wb") as f:
                # Stream to disk in chunks to avoid holding the whole body in memory.
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)
        _log.info(f"Downloaded to {path}")
    else:
        _log.info(f"{path} already exists. (Not downloading from {url} again)")
    return path
14 changes: 8 additions & 6 deletions train/xml-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections import Counter

from dahuffman import HuffmanCodec
from train.train_utils import download, CODECS
from train.train_utils import CODECS, download

_log = logging.getLogger()

Expand Down Expand Up @@ -34,21 +34,23 @@ def main():
"https://data.mo.gov/api/views/vpge-tj3s/rows.xml",
]

_log.info('Building frequency tables')
_log.info("Building frequency tables")
frequencies = Counter()
for url in urls:
path = download(url, 'xml-data/' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.xml')
with path.open('r') as f:
path = download(
url, "xml-data/" + hashlib.md5(url.encode("utf-8")).hexdigest() + ".xml"
)
with path.open("r") as f:
# Only take first N bytes.
# Large files probably have a lot of structural repetition, which skews the frequencies
raw = f.read(100000)
frequencies.update(raw)

# TODO add more metadata
_log.info(f'Frequencies raw {len(frequencies)}: {frequencies}')
_log.info(f"Frequencies raw {len(frequencies)}: {frequencies}")
codec = HuffmanCodec.from_frequencies(frequencies)
codec.save(CODECS / "xml.pickle", metadata={"frequencies": frequencies})


if __name__ == '__main__':
if __name__ == "__main__":
main()

0 comments on commit 8cd37f8

Please sign in to comment.