Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CGHub download #38

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions eggo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
EGGO_S3N_TMP_URL = os.path.join(EGGO_S3N_BUCKET_URL, S3_TMP_DATA_KEY_PREFIX)
EGGO_S3N_RAW_URL = os.path.join(EGGO_S3N_BUCKET_URL, S3_RAW_DATA_KEY_PREFIX)

# URL of CGHub's public key file. The CGHub download helper in eggo/dag.py
# falls back to this value when no key is passed in and the CGHUB_KEY
# environment variable is not set.
CGHUB_PUBLIC_KEY = 'https://cghub.ucsc.edu/software/downloads/cghub_public.key'

def validate_config(d):
"""Validate a JSON config file for an eggo dataset"""
Expand Down
50 changes: 43 additions & 7 deletions eggo/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

from eggo.config import (
validate_config, EGGO_S3_BUCKET_URL, EGGO_S3N_BUCKET_URL, EGGO_S3_RAW_URL,
EGGO_S3N_RAW_URL, EGGO_S3_TMP_URL)
EGGO_S3N_RAW_URL, EGGO_S3_TMP_URL, CGHUB_PUBLIC_KEY)
from eggo.util import random_id, build_s3_filename


Expand Down Expand Up @@ -69,6 +69,40 @@ class ToastConfig(Config):
config = JsonFileParameter()


def _parse_cghub_url(url):
    """Split an eggo CGHub URL into ``(analysis_id, filename)``.

    The ``cghub://<analysis_id>/<filename>`` form is an eggo registry
    convention rather than a CGHub one: it carries exactly one analysis ID
    (the CGHub handle for a single downloadable object) and one file name
    of interest, so a well-formed URL always has exactly two parts.

    Raises ValueError if the scheme is missing, either part is empty, or
    the URL contains additional path components.
    """
    scheme = 'cghub://'
    if not url.startswith(scheme):
        raise ValueError('not a cghub url: {0!r}'.format(url))
    # Slice the prefix off instead of using str.lstrip(): lstrip() removes a
    # *set of characters*, so IDs starting with c, g, h, u or b (common for
    # hex UUIDs) would lose their leading characters.
    analysis_id, _, filename = url[len(scheme):].partition('/')
    if not analysis_id or not filename or '/' in filename:
        raise ValueError(
            'cghub url must look like cghub://<analysis_id>/<filename>: '
            '{0!r}'.format(url))
    return analysis_id, filename


def _cghub_download(url, tmp_dir, cghub_key=None, n_threads=8):
    """Download from CGHub to TMP_DIR.

    Requires GeneTorrent. Download client `gtdownload` must be
    on PATH. Use public key if none provided and CGHUB_KEY
    environment variable not set.

    Returns the analysis subdirectory ``<tmp_dir>/<analysis_id>``;
    gtdownload creates a folder named after the analysis ID and stores the
    downloaded files inside it.

    Raises ValueError for a malformed URL and
    subprocess.CalledProcessError if gtdownload exits with a non-zero status.
    """
    # Local import so this block does not depend on the file's import list.
    import subprocess

    # 1. Check env for CGHub key and substitute public if necessary
    if cghub_key is None:
        cghub_key = os.environ.get('CGHUB_KEY') or CGHUB_PUBLIC_KEY

    # 2. Parse url for analysis ID (the file name is not needed to download)
    analysis_id, _ = _parse_cghub_url(url)

    # 3. Download with gtdownload
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact. expanduser() preserves the "~" expansion the
    # shell used to perform on a key path; it leaves URLs untouched.
    cmd = ['gtdownload',
           '-c', os.path.expanduser(cghub_key),
           '-p', tmp_dir,
           '--max-children', str(n_threads),
           '-v', analysis_id]
    # Fail loudly rather than return a path to a download that never happened.
    subprocess.check_call(cmd)

    return os.path.join(tmp_dir, analysis_id)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to get rid of this return statement to be consistent with _http_download.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. This is to prevent nested folders in the eggo bucket, since gtdownload creates a folder named after the analysis_id and stores the files inside it. I'll have it copy them out into tmp_dir instead.



def _http_download(url, tmp_dir):
    """Download URL via HTTP into TMP_DIR.

    Requires curl on PATH. The file keeps its remote name (``curl -O``) and
    redirects are followed (``-L``).

    Raises subprocess.CalledProcessError if curl exits with a non-zero
    status.
    """
    # Local import so this block does not depend on the file's import list.
    import subprocess

    # Run curl directly with cwd=tmp_dir instead of building a
    # "pushd ... && curl ... && popd" shell string: URLs routinely contain
    # shell metacharacters such as "&" and "?", and pushd/popd are bash
    # builtins that the /bin/sh used by shell=True need not provide.
    subprocess.check_call(['curl', '-L', '-O', url], cwd=tmp_dir)


def _dnload_to_local_upload_to_s3(source, destination, compression):
# source: (string) URL suitable for curl
# destination: (string) full S3 path of destination file name
Expand All @@ -78,10 +112,12 @@ def _dnload_to_local_upload_to_s3(source, destination, compression):
tmp_dir = mkdtemp(prefix='tmp_eggo_', dir=EPHEMERAL_MOUNT)

# 1. dnload file
dnload_cmd = 'pushd {tmp_dir} && curl -L -O {source} && popd'
p = Popen(dnload_cmd.format(tmp_dir=tmp_dir, source=source),
shell=True)
p.wait()
if source.startswith('http'):
_http_download(source, tmp_dir)
elif source.startswith('cghub'):
tmp_dir = _cghub_download(source, tmp_dir)
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about ftp?

raise ValueError('source must be http(s) or cghub url')

# 2. decompress if necessary
if compression:
Expand All @@ -96,13 +132,13 @@ def _dnload_to_local_upload_to_s3(source, destination, compression):

# 3. upload to tmp S3 location
tmp_s3_path = os.path.join(EGGO_S3_TMP_URL, random_id())
upload_cmd = 'pushd {tmp_dir} && aws s3 cp ./* {s3_path} && popd'
upload_cmd = 'pushd {tmp_dir} && aws s3 cp . {s3_path} --recursive && popd'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the rationale behind this change? At the moment, I think that additional levels of nested directories will break our URL scheme. But perhaps this is something we need to support? cc @tomwhite

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Globs weren't working for me with aws s3 cp when testing locally with the latest version of awscli, but --recursive worked fine. I didn't know you were purposely restricting nested folders. What version are you using?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I'm pretty sure the latest version gets installed on the cluster. This change is fine with me as long as the result is the same, but I haven't had a problem yet.

p = Popen(upload_cmd.format(tmp_dir=tmp_dir, s3_path=tmp_s3_path),
shell=True)
p.wait()

# 4. rename to final target location
rename_cmd = 'aws s3 mv {tmp_path} {final_path}'
rename_cmd = 'aws s3 mv {tmp_path} {final_path} --recursive'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment

p = Popen(rename_cmd.format(tmp_path=tmp_s3_path,
final_path=destination),
shell=True)
Expand Down
8 changes: 8 additions & 0 deletions eggo/fabric_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,14 @@ def _install_maven(version):
run('mvn -version')


def _install_genetorrent():
    """Install the GeneTorrent download client on the remote host.

    Downloads and unpacks the GeneTorrent 3.8.7 (CentOS 6.4, x86_64) tarball
    under /usr/local/genetorrent, then appends a line to ~/.bash_profile
    that adds /usr/local/genetorrent/cghub/bin to PATH.
    """
    install_dir = '/usr/local/genetorrent'
    base_url = 'https://cghub.ucsc.edu/software/downloads/GeneTorrent/3.8.7/'
    tarball = 'GeneTorrent-download-3.8.7-207-CentOS6.4.x86_64.tar.gz'

    run('mkdir -p ' + install_dir)
    with cd(install_dir):
        run('wget ' + base_url + tarball)
        run('tar -xvf ' + tarball)

    # Single quotes make the remote shell write "$PATH" literally. Inside
    # double quotes it would be expanded while this command runs, freezing
    # the install-time PATH into the profile instead of extending whatever
    # PATH is in effect at login.
    run("echo 'export PATH=$PATH:" + install_dir + "/cghub/bin'"
        " >> ~/.bash_profile")


def _install_adam():
# check out latest adam master
with cd('~'):
Expand Down
942 changes: 942 additions & 0 deletions registry/ccle-rnaseq.json

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions registry/ccle-wgs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"name": "ccle-wgs",
"description": "Cancer Cell Line Encyclopedia whole genome sequencing",
"dag": "BAM2ADAMTask",
"sources": [
{"name": "CCLE-HCC1143-DNA-08", "format": "bam", "compression": false, "url": "cghub://c727c60e-8ddd-1ddc-e040-ad451e414a77/G15511.HCC1143.2.bam"},
{"name": "CCLE-HCC1143BL-DNA-08", "format": "bam", "compression": false, "url": "cghub://c727c60e-95b8-1455-e040-ad451e414a79/G15511.HCC1143_BL.2.bam"},
{"name": "TEST-DK-A1AE-01A-11D-A13U-02", "format": "bam", "compression": false, "url": "cghub://c727c612-1be1-8c27-e040-ad451e414a7f/G15512.HCC1954_BL.5.bam"},
{"name": "CCLE-K-562-DNA-08", "format": "bam", "compression": false, "url": "cghub://c72531e9-0e79-91c5-e040-ad451e411a96/G15509.K-562.2.bam"},
{"name": "CCLE-HCC1954-DNA-08", "format": "bam", "compression": false, "url": "cghub://c727c60e-00b0-a33d-e040-ad451e414a75/G15512.HCC1954.5.bam"},
{"name": "CCLE-HCC1143-v2-DNA-08", "format": "bam", "compression": false, "url": "cghub://dc9dd275-7e93-4ce0-838e-47ca7b5751a6/G31860.HCC1143.1.bam"},
{"name": "CCLE-HCC1954-v1-DNA-08", "format": "bam", "compression": false, "url": "cghub://84a23558-26b3-42f4-a9f3-d83d118f2327/G31860.HCC1954.6.bam"},
{"name": "CCLE-HCC1143BL-DNA-08", "format": "bam", "compression": false, "url": "cghub://c1ea1ce7-3144-447b-bb9d-6e660cff48ce/G31860.HCC1143BL.1.bam"},
{"name": "CCLE-HCC1954BL-DNA-08", "format": "bam", "compression": false, "url": "cghub://74fd150b-c807-4c86-9e80-43650698e728/G31860.HCC1954BL.1.bam"},
{"name": "CCLE-HCC1954-DNA-09", "format": "bam", "compression": false, "url": "cghub://04e5603b-3990-4ba3-80cd-5785986332cb/19a6a7e8a4f65d8da403ee991c0fd433.bam"},
{"name": "CCLE-HCC1143-DNA-09", "format": "bam", "compression": false, "url": "cghub://3fb70f6d-11b0-44f0-b89e-cb538782d5be/48f412002a88837bf75ae9a01adfc09a.bam"},
{"name": "CCLE-HCC1143BL-DNA-09", "format": "bam", "compression": false, "url": "cghub://7cf7dec4-6f85-496d-a396-318d33b4237b/4fa1bd6b37497f6b8bda7c221bcc4e1a.bam"},
{"name": "CCLE-HCC1954BL-DNA-09", "format": "bam", "compression": false, "url": "cghub://ca8ac363-f9ca-4d7a-ab73-20299581361b/acf32da4fec9eefd1018e4c5170653b7.bam"},
{"name": "CCLE-HCC1143-DNA-10", "format": "bam", "compression": false, "url": "cghub://1aa893cb-72c2-45ac-bcc9-25b21562ab21/CCLE-HCC1143-DNA-10_mate-pair_Illumina.bam"},
{"name": "CCLE-HCC1954-DNA-10", "format": "bam", "compression": false, "url": "cghub://0678564a-6526-403d-887e-bc1d9a76ef29/CCLE-HCC1954-DNA-10_mate-pair_Illumina.bam"},
{"name": "CCLE-HCC1143-DNA-10", "format": "bam", "compression": false, "url": "cghub://aafe59bb-46e1-4c01-bbff-c89859b1c2fe/CCLE-HCC1143-DNA-10_Illumina.bam"},
{"name": "CCLE-HCC1143BL-DNA-10", "format": "bam", "compression": false, "url": "cghub://c03af4dc-bb34-4a24-af8c-69df43866e66/CCLE-HCC1143BL-DNA-10_Illumina.bam"},
{"name": "CCLE-HCC1954BL-DNA-10", "format": "bam", "compression": false, "url": "cghub://c4cffb4e-7475-4fea-8ec1-08a89d1c957e/CCLE-HCC1954BL-DNA-10_Illumina.bam"},
{"name": "CCLE-HCC1954-DNA-10", "format": "bam", "compression": false, "url": "cghub://23939629-c789-46ed-ad52-415622000056/CCLE-HCC1954-DNA-10_Illumina.bam"},
{"name": "CCLE-HCC1143BL-DNA-08", "format": "bam", "compression": false, "url": "cghub://f0e84c18-bce1-4489-a3ea-ff033deb9ce3/D4491.Solexa-178366.3.bam"},
{"name": "CCLE-HCC1143-v2-DNA-08", "format": "bam", "compression": false, "url": "cghub://d9ac82a7-a9d3-4545-99f2-b8e7ead79c9c/D4491.Solexa-178365.2.bam"},
{"name": "CCLE-HCC1954-v1-DNA-08", "format": "bam", "compression": false, "url": "cghub://55c6e4a3-057a-4a55-a2af-b7f113422d3a/D4491.Solexa-178364.2.bam"},
{"name": "CCLE-HCC1954BL-DNA-08", "format": "bam", "compression": false, "url": "cghub://dd630fed-573f-4726-958b-92639997b20d/D4491.Solexa-178367.2.bam"}
]
}
Loading