Skip to content

Commit

Permalink
Merge pull request #40 from danielfromearth/feature/issues-8-and-10
Browse files Browse the repository at this point in the history
Feature/issues 8 and 10
  • Loading branch information
danielfromearth authored Nov 7, 2023
2 parents 3598db4 + a833be2 commit 7d7de29
Show file tree
Hide file tree
Showing 14 changed files with 856 additions and 225 deletions.
103 changes: 103 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
TEMPO_*.nc
tests/
pytest.ini
.git
.gitignore
.travis.yaml
.swagger-codegen-ignore
tox.ini

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
venv/
.python-version

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

#Ipython Notebook
.ipynb_checkpoints

# Intellij project settings
.idea

# VSCode project settings
.vscode

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# ruff
.ruff_cache/
# pytest
.pytest_cache/

# Pyre type checker
.pyre/

# Github integration settings
.github

# .dockerignore and builder script themselves are not needed by
# docker build.
.dockerignore
build-service
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,3 @@ repos:
- id: yamllint
args: ["-d {extends: relaxed, rules: {line-length: {max: 120}}}"]
stages: [commit, push]

- repo: https://github.com/pryorda/dockerfilelint-precommit-hooks
rev: v0.1.0
hooks:
- id: dockerfilelint
stages: [commit, push]
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- [PR #1](https://github.com/danielfromearth/stitchee/pull/1): An initial GitHub Actions workflow
- [Issue #8](https://github.com/danielfromearth/stitchee/issues/8): Create working Docker image
- [Issue #10](https://github.com/danielfromearth/stitchee/issues/10): Add code necessary to communicate with Harmony
### Changed
- [PR #12](https://github.com/danielfromearth/stitchee/pull/12): Changed name to "stitchee"
- [PR #15](https://github.com/danielfromearth/stitchee/pull/15): Use ruff+black chain for pre-commit lint & format
Expand Down
47 changes: 47 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@

FROM python:3.10-slim

RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get upgrade -y \
gcc \
libnetcdf-dev \
#libhdf5-dev \
#hdf5-helpers \
&& pip3 install --upgrade pip \
&& pip3 install cython \
&& pip3 install poetry \
&& apt-get clean && rm -rf /var/lib/apt/lists/*


# Create a new user
RUN adduser --quiet --disabled-password --shell /bin/sh --home /home/dockeruser --gecos "" --uid 1000 dockeruser
USER dockeruser
ENV HOME /home/dockeruser
ENV PYTHONPATH "${PYTHONPATH}:/home/dockeruser/.local/bin"
ENV PATH="/home/dockeruser/.local/bin:${PATH}"

# The 'SOURCE' argument is what will be used in 'pip install'.
ARG SOURCE

# Set this argument if running the pip install on a local directory, so
# the local dist files are copied into the container.
ARG DIST_PATH

USER root
RUN mkdir -p /worker && chown dockeruser /worker
COPY pyproject.toml /worker

WORKDIR /worker
# ENV PYTHONPATH=${PYTHONPATH}:${PWD}
COPY --chown=dockeruser $DIST_PATH $DIST_PATH
#RUN pip3 install --no-cache-dir --force --user --index-url https://pypi.org/simple/ --extra-index-url https://test.pypi.org/simple/ $SOURCE \
# && rm -rf $DIST_PATH

#install poetry as root
RUN poetry config virtualenvs.create false
RUN poetry install --only main

USER dockeruser
COPY --chown=dockeruser ./docker-entrypoint.sh docker-entrypoint.sh
# Run the service
ENTRYPOINT ["./docker-entrypoint.sh"]
3 changes: 3 additions & 0 deletions build-service
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash

docker build -t "asdc-trade/stitchee:${VERSION-latest}" .
Empty file.
33 changes: 33 additions & 0 deletions concatenator/harmony/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""A Harmony CLI wrapper around the concatenate-batcher"""
from argparse import ArgumentParser

import harmony

from concatenator.harmony.service_adapter import StitcheeAdapter as HarmonyAdapter


def main(config: harmony.util.Config = None) -> None:
"""Parse command line arguments and invoke the service to respond to them.
Parameters
----------
config : harmony.util.Config
harmony.util.Config is injectable for tests
Returns
-------
None
"""
parser = ArgumentParser(
prog="Stitchee", description="Run the STITCH by Extending a dimEnsion service"
)
harmony.setup_cli(parser)
args = parser.parse_args()
if harmony.is_harmony_cli(args):
harmony.run_cli(parser, args, HarmonyAdapter, cfg=config)
else:
parser.error("Only --harmony CLIs are supported")


if __name__ == "__main__":
main()
119 changes: 119 additions & 0 deletions concatenator/harmony/download_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""A utility for downloading multiple granules simultaneously"""

import queue
import re
from copy import deepcopy
from multiprocessing import Manager, Process
from os import cpu_count
from pathlib import Path
from urllib.parse import urlparse

from harmony.logging import build_logger
from harmony.util import download


def multi_core_download(
urls: list, destination_dir: str, access_token: str, cfg: dict, process_count: int | None = None
) -> list[Path]:
"""
A method which automagically scales downloads to the number of CPU
cores. For further explaination, see documentation on "multi-track
drifting"
Parameters
----------
urls : list
list of urls to download
destination_dir : str
output path for downloaded files
access_token : str
access token as provided in Harmony input
cfg : dict
Harmony configuration information
process_count : int, optional
Number of worker processes to run (expected >= 1)
Returns
-------
list
list of downloaded files as pathlib.Path objects
"""

if process_count is None:
process_count = cpu_count()
if process_count is None:
raise RuntimeError("cannot determine number of cpus")

with Manager() as manager:
url_queue = manager.Queue(len(urls))
path_list = manager.list()

for url in urls:
url_queue.put(url)

# Spawn worker processes
processes = []
for _ in range(process_count):
download_process = Process(
target=_download_worker,
args=(url_queue, path_list, destination_dir, access_token, cfg),
)
processes.append(download_process)
download_process.start()

# Ensure worker processes exit successfully
for process in processes:
process.join()
if process.exitcode != 0:
raise RuntimeError(f"Download failed - exit code: {process.exitcode}")

process.close()

path_list = deepcopy(path_list) # ensure GC can cleanup multiprocessing

return [Path(path) for path in path_list]


def _download_worker(
url_queue: queue.Queue, path_list: list, destination_dir: str, access_token: str, cfg: dict
) -> None:
"""
A method to be executed in a separate process which processes the url_queue
and places paths to completed downloads into the path_list. Downloads are
handled by harmony.util.download
Parameters
----------
url_queue : queue.Queue
URLs to process - should be filled from start and only decreases
path_list : list
paths to completed file downloads
destination_dir : str
output path for downloaded files
access_token : str
access token as provided in Harmony input
cfg : dict
Harmony configuration information
"""

logger = build_logger(cfg)

while not url_queue.empty():
try:
url = url_queue.get_nowait()
except queue.Empty:
break

path = Path(
download(url, destination_dir, logger=logger, access_token=access_token, cfg=cfg)
)
filename_match = re.match(r".*\/(.+\..+)", urlparse(url).path)

if filename_match is not None:
filename = filename_match.group(1)
dest_path = path.parent.joinpath(filename)
path = path.rename(dest_path)
else:
logger.warning("Origin filename could not be assertained - %s", url)

path_list.append(str(path))
Loading

0 comments on commit 7d7de29

Please sign in to comment.