Skip to content

Commit

Permalink
Merge pull request #53 from laserson/EGGO-18-config
Browse files Browse the repository at this point in the history
[EGGO-18] WIP: Setup config files
  • Loading branch information
laserson committed May 7, 2015
2 parents 6f2bd8b + 9f68db6 commit ff9f1b8
Show file tree
Hide file tree
Showing 17 changed files with 673 additions and 474 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ converting the data sets yourself, simply get them from the eggo S3 bucket:
s3://bdg-eggo
```

Eggo also provides a command-line interface for easily provisioning Hadoop
clusters in the cloud (built using Fabric) and also the necessary code to
convert the data sets from the legacy formats into the Hadoop-friendly versions
(built with Luigi).

## User interface

Not implemented yet.
Expand Down Expand Up @@ -151,7 +156,7 @@ export EPHEMERAL_MOUNT=/tmp
export ADAM_HOME=~/workspace/adam
export HADOOP_HOME=~/sw/hadoop-2.5.1/
export SPARK_HOME=~/sw/spark-1.3.0-bin-hadoop2.4/
export SPARK_MASTER_URL=local
export SPARK_MASTER_URI=local
export STREAMING_JAR=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-2.5.1.jar
export PATH=$PATH:$HADOOP_HOME/bin
```
Expand Down
54 changes: 0 additions & 54 deletions bin/download_upload_mapper.sh

This file was deleted.

42 changes: 15 additions & 27 deletions bin/download_upload.sh → bin/eggo
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# args: EPHEMERAL_MOUNT SOURCE_URL COMPRESSION_TYPE TMP_S3_PATH FINAL_S3_PATH
# COMPRESSION_TYPE can be NONE or GZIP
# TODO: weaken the requirements of setting these variables. If eggo is pip-
# installed in the future, then EGGO_HOME can default to the site-packages
# install location. If it's just an "end-user" looking to browse available
# data sets etc., there may be no need for the user to have EGGO_CONFIG set

# download the file locally
if [ -f /root/eggo/eggo-ec2-variables.sh ]; then
source /root/eggo/eggo-ec2-variables.sh
if [ -z "$EGGO_HOME" ]; then
echo >&2 "EGGO_HOME is unset. Aborting."
exit 1
fi
export EGGO_TMP_DIR=$(mktemp -d --tmpdir=$1 tmp_eggo_XXXX)
pushd $EGGO_TMP_DIR
curl -L -O $2

# decompress if necessary
case $3 in
NONE)
;;
GZIP)
gunzip *.gz
;;
*)
echo "Expected NONE or GZIP; got $3."
popd
rm -rf $EGGO_TMP_DIR
exit 1
;;
esac
if [ -z "$EGGO_CONFIG" ]; then
echo >&2 "EGGO_CONFIG is unset. Aborting."
exit 1
fi

# Check if fabric is installed
command -v fab >/dev/null 2>&1 || { echo >&2 "fab not found on PATH. Aborting."; exit 1; }

# upload to S3
aws s3 cp ./* $4
aws s3 mv $4 $5
popd
rm -rf $EGGO_TMP_DIR
fab -f "$EGGO_HOME/eggo/fabric_cli.py" "$@"
3 changes: 1 addition & 2 deletions bin/toaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@

"""Exposes top-level Luigi Tasks associated with ETLing the datasets."""

# these classes must accept a "config" parameter which takes the json file
# Only top-level "user-facing" Luigi DAGs should be imported here

from eggo.dag import VCF2ADAMTask, BAM2ADAMTask


if __name__ == '__main__':
from luigi import run
run()
6 changes: 0 additions & 6 deletions client.cfg

This file was deleted.

File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
"AWSTemplateFormatVersion" : "2010-09-09",
"Description" : "This template creates a VPC infrastructure for a single-AZ, single public subnet deployment of CDH",
"Parameters" : {

"KeyPairName" : { "Type" : "String" },
"DMZCIDR" : { "Type" : "String", "Default" : "10.1.1.0/24" },
"VPCCIDR" : { "Type" : "String", "Default" : "10.1.0.0/16" },
"AZ" : { "Type" : "String", "Default" : "us-east-1b" },
"DomainDNSName" : { "Type" : "String", "Default" : "ec2.internal" }
},

"Resources" : {

"VPC" : {
Expand All @@ -24,23 +24,23 @@
]
}
},

"DHCPOptions" : {
"Type" : "AWS::EC2::DHCPOptions",
"Properties" : {
"DomainName" : { "Ref" : "DomainDNSName"},
"DomainNameServers" : ["AmazonProvidedDNS"]
}
},

"VPCDHCPOptionsAssociation" : {
"Type" : "AWS::EC2::VPCDHCPOptionsAssociation",
"Properties" : {
"VpcId" : {"Ref" : "VPC"},
"DhcpOptionsId" : {"Ref" : "DHCPOptions"}
}
},

"DMZSubnet" : {
"Type" : "AWS::EC2::Subnet",
"Properties" : {
Expand All @@ -64,7 +64,7 @@
]
}
},

"ClusterSG" : {
"Type" : "AWS::EC2::SecurityGroup",
"Properties" : {
Expand All @@ -76,15 +76,15 @@
"VpcId" : { "Ref" : "VPC" }
}
},

"AttachGateway" : {
"Type" : "AWS::EC2::VPCGatewayAttachment",
"Properties" : {
"VpcId" : { "Ref" : "VPC" },
"InternetGatewayId" : { "Ref" : "InternetGateway" }
}
},

"DMZRouteTable" : {
"Type" : "AWS::EC2::RouteTable",
"Properties" : {
Expand All @@ -95,7 +95,7 @@
]
}
},

"DMZRoute" : {
"Type" : "AWS::EC2::Route",
"Properties" : {
Expand All @@ -104,7 +104,7 @@
"GatewayId" : { "Ref" : "InternetGateway" }
}
},

"DMZSubnetRouteTableAssociation" : {
"Type" : "AWS::EC2::SubnetRouteTableAssociation",
"Properties" : {
Expand All @@ -113,7 +113,7 @@
}
}
},

"Outputs" : {
"VPC" : {
"Value" : { "Ref" : "VPC" },
Expand All @@ -127,5 +127,5 @@
"Value" : { "Ref" : "ClusterSG"},
"Description" : "Cluster security group"
}
}
}
}
92 changes: 92 additions & 0 deletions conf/eggo/eggo.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
; Eggo "global" config

; NOTE: to access the EGGO_HOME env variable, use "%(eggo_home)s"

[core]
; The location of the result data sets
; supports S3 (s3n://), HDFS (hdfs://), and local (file://) targets
; the specified URL will be the root of the eggo directory structure (see spec)
eggo_base_url: s3n://bdg-eggo

; The context for execution
; Possible values are spark_ec2, director, local
execution: spark_ec2

[versions]
eggo_fork: laserson
eggo_branch: EGGO-18-config
adam_fork: bigdatagenomics
adam_branch: master
maven: 3.2.5


[paths]
; Path to stage the raw input data on the target distributed fs
; raw data ends up in <eggo_base_url>/<dfs_raw_data_prefix>/<dataset_name>
dfs_raw_data_prefix: raw

; Path to store tmp/intermediate data in the target distributed fs
; tmp data ends up in <eggo_base_url>/<dfs_tmp_data_prefix>/<dataset_name>/<random_id>
dfs_tmp_data_prefix: tmp

; Absolute path to a location on the remote worker's local fs where eggo can
; write data to. e.g., on EC2 machines, this could be the ephemeral drive mount
; point /mnt
worker_data_dir: /mnt

; Random identifier that is generated on module load. Do not set this manually
;random_id: <generated-on-load>


[local_env]
; Can be overridden by setting SPARK_HOME env var
spark_home: $SPARK_HOME ; hack: this var gets interp into shell cmds so env var will fill it


[worker_env]
eggo_config_worker_path: /mnt/eggo.cfg ; current file is copied to remote location

; Comma-separated list of files to source in .bash_profile on remote worker
files_to_source: /root/spark-ec2/ec2-variables.sh

; Environment variables to set on the worker nodes in .bash_profile. Their
; values can depend on variables set in %(files_to_source)s, as those files
; will be sourced first
hadoop_home: /root/ephemeral-hdfs
spark_home: /root/spark
streaming_jar: %(hadoop_home)s/contrib/streaming/hadoop-streaming-1.0.4.jar
spark_master_uri: spark://$MASTERS:7077
;eggo_home: <not configurable, but set as EGGO_HOME>
;luigi_config_path: <not configurable, but set as $EGGO_HOME/conf/luigi/luigi.cfg>


[aws]
; These can be set/overridden by setting corresponding local env vars (in ALL_CAPS)
;aws_access_key_id: <MY_ACCESS_KEY>
;aws_secret_access_key: <MY_SECRET_KEY>
;ec2_key_pair: <MY_KEY>
;ec2_private_key_file: <MY_KEY_FILE>


[spark_ec2]
; Spark EC2 scripts configuration
; Path to local Spark installation
region: us-east-1
instance_type: r3.2xlarge
num_slaves: 2
user: root


[director]
; Cloudera Director configuration
; TODO: add docs to these options
region: us-east-1
launcher_instance_type: m3.large
launcher_ami: ami-a25415cb ; RHEL 6.4 x86
cluster_ami: %(launcher_ami)s
stack_name: bdg-eggo
; the paths to the director configs are resolved relative to CWD (which may
; be the same as EGGO_HOME); set them to an absolute path if desired, or use
; "%(eggo_home)s" to access the EGGO_HOME env variable
cloudformation_template: conf/director/cfn-cloudera-us-east-1-public-subnet.template
director_conf_template: conf/director/aws.conf
File renamed without changes.
Loading

0 comments on commit ff9f1b8

Please sign in to comment.