Skip to content

Commit

Permalink
[EGGO-18] WIP: Setup config files
Browse files Browse the repository at this point in the history
  • Loading branch information
laserson committed May 2, 2015
1 parent 6f2bd8b commit bdb9838
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 109 deletions.
3 changes: 2 additions & 1 deletion client.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Luigi client config

[core]
logging_conf_file: luigi_logging.ini

[hadoop]
command: hadoop
# command: /root/ephemeral-hdfs/bin/hadoop
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
"AWSTemplateFormatVersion" : "2010-09-09",
"Description" : "This template creates a VPC infrastructure for a single-AZ, single public subnet deployment of CDH",
"Parameters" : {

"KeyPairName" : { "Type" : "String" },
"DMZCIDR" : { "Type" : "String", "Default" : "10.1.1.0/24" },
"VPCCIDR" : { "Type" : "String", "Default" : "10.1.0.0/16" },
"AZ" : { "Type" : "String", "Default" : "us-east-1b" },
"DomainDNSName" : { "Type" : "String", "Default" : "ec2.internal" }
},

"Resources" : {

"VPC" : {
Expand All @@ -24,23 +24,23 @@
]
}
},

"DHCPOptions" : {
"Type" : "AWS::EC2::DHCPOptions",
"Properties" : {
"DomainName" : { "Ref" : "DomainDNSName"},
"DomainNameServers" : ["AmazonProvidedDNS"]
}
},

"VPCDHCPOptionsAssociation" : {
"Type" : "AWS::EC2::VPCDHCPOptionsAssociation",
"Properties" : {
"VpcId" : {"Ref" : "VPC"},
"DhcpOptionsId" : {"Ref" : "DHCPOptions"}
}
},

"DMZSubnet" : {
"Type" : "AWS::EC2::Subnet",
"Properties" : {
Expand All @@ -64,7 +64,7 @@
]
}
},

"ClusterSG" : {
"Type" : "AWS::EC2::SecurityGroup",
"Properties" : {
Expand All @@ -76,15 +76,15 @@
"VpcId" : { "Ref" : "VPC" }
}
},

"AttachGateway" : {
"Type" : "AWS::EC2::VPCGatewayAttachment",
"Properties" : {
"VpcId" : { "Ref" : "VPC" },
"InternetGatewayId" : { "Ref" : "InternetGateway" }
}
},

"DMZRouteTable" : {
"Type" : "AWS::EC2::RouteTable",
"Properties" : {
Expand All @@ -95,7 +95,7 @@
]
}
},

"DMZRoute" : {
"Type" : "AWS::EC2::Route",
"Properties" : {
Expand All @@ -104,7 +104,7 @@
"GatewayId" : { "Ref" : "InternetGateway" }
}
},

"DMZSubnetRouteTableAssociation" : {
"Type" : "AWS::EC2::SubnetRouteTableAssociation",
"Properties" : {
Expand All @@ -113,7 +113,7 @@
}
}
},

"Outputs" : {
"VPC" : {
"Value" : { "Ref" : "VPC" },
Expand All @@ -127,5 +127,5 @@
"Value" : { "Ref" : "ClusterSG"},
"Description" : "Cluster security group"
}
}
}
}
45 changes: 45 additions & 0 deletions conf/eggo/eggo.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
; Eggo "global" config

[paths]
; The location of the result data sets
; supports S3 (s3n://), HDFS (hdfs://), and local (file://) targets
; the specified URL will be the root of the eggo directory structure (see spec)
eggo_base_url: s3n://bdg-eggo

; Path to stage the raw input data on the target distributed fs
raw_data_prefix: raw

; Path to store tmp/intermediate data in the target distributed fs
; tmp data always ends up in eggo_base_url/tmp/dataset_name/tmp_data_prefix/...
;tmp_data_prefix: ; if not set manually, module load generates random prefix

; Absolute path to store tmp data on remote worker machines' local file systems
; e.g., on EC2 machines, this could be the ephemeral drive mount point /mnt
worker_local_tmp_data_dir: /mnt


[hadoop]
; NOTE: make sure the Luigi hadoop config is appropriately set in client.cfg
hadoop_home: /root/ephemeral-hdfs ; overridden by HADOOP_HOME env var
; this will override the similar option in Luigi's client.cfg
streaming_jar:

[aws]
; These will all be overridden by corresponding env variables (in ALL_CAPS)
aws_access_key_id: <MY_ACCESS_KEY>
aws_secret_access_key: <MY_SECRET_KEY>
ec2_key_pair: <MY_KEY>
ec2_private_key_file: <MY_KEY_FILE>


[director]
; TODO: add docs to these options
region: us-east-1
launcher_instance_type: m3.large
launcher_ami: ami-a25415cb ; RHEL 6.4 x86
cluster_ami: %(launcher_ami)s
stack_name: bdg-eggo-%(ec2_key_pair)s
; the paths to the director configs are resolved relative to CWD (which may
; be the same as EGGO_HOME); set them to an absolute path if desired, or use
; "%(eggo_home)s" to access the EGGO_HOME env variable
cloudformation_template: conf/director/cfn-cloudera-us-east-1-public-subnet.template
director_conf_template: conf/director/aws.conf
61 changes: 52 additions & 9 deletions eggo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,62 @@
# limitations under the License.

import os
from ConfigParser import SafeConfigParser

from eggo.util import random_id

# Key prefix, joined onto EGGO_BASE_URL below, under which raw input data is stored
RAW_DATA_KEY_PREFIX = 'raw'
# each module load/invocation will generate a new temp location in the distributed fs
# (random_id() is called once, when this module is imported)
TMP_DATA_KEY_PREFIX = random_id()

# Base URL for the tmp and raw URLs below; the EGGO_BASE_URL env var, when
# set, takes precedence over the hard-coded default
EGGO_BASE_URL = os.environ.get('EGGO_BASE_URL', 's3n://bdg-eggo')
# NOTE(review): os.path.join joins with the local OS path separator, which
# only matches URL syntax on POSIX systems
EGGO_TMP_URL = os.path.join(EGGO_BASE_URL, TMP_DATA_KEY_PREFIX)
EGGO_RAW_URL = os.path.join(EGGO_BASE_URL, RAW_DATA_KEY_PREFIX)
# EGGO CONFIGURATION

def _init_eggo_config():
    """Load the global eggo configuration and apply environment overrides.

    The config file named by the EGGO_CONFIG environment variable is parsed
    with a SafeConfigParser.  If EGGO_HOME is set it is exposed to the config
    file as the ``eggo_home`` default, so values may use ``%(eggo_home)s``.

    After parsing, the following environment variables (when set) replace the
    corresponding options:

        HADOOP_HOME            -> [hadoop] hadoop_home
        AWS_ACCESS_KEY_ID      -> [aws] aws_access_key_id
        AWS_SECRET_ACCESS_KEY  -> [aws] aws_secret_access_key
        EC2_KEY_PAIR           -> [aws] ec2_key_pair
        EC2_PRIVATE_KEY_FILE   -> [aws] ec2_private_key_file

    Finally, if [paths] tmp_data_prefix is not provided, a fresh random_id()
    is stored there.

    Returns:
        The populated SafeConfigParser instance.

    Raises:
        KeyError: if the EGGO_CONFIG environment variable is not set.
        IOError: if the file named by EGGO_CONFIG cannot be opened.
    """
    # if EGGO_HOME is set, add it to the config; always start from a dict so
    # that an unset EGGO_HOME does not leave the defaults undefined
    defaults = {}
    if 'EGGO_HOME' in os.environ:
        defaults['eggo_home'] = os.environ['EGGO_HOME']

    eggo_config = SafeConfigParser(defaults=defaults,
                                   dict_type=dict,
                                   allow_no_value=False)

    config_path = os.environ['EGGO_CONFIG']
    with open(config_path, 'r') as ip:
        eggo_config.readfp(ip, config_path)

    # (environment variable, section, option) triples; a set env var wins
    # over whatever the config file says
    env_overrides = (
        ('HADOOP_HOME', 'hadoop', 'hadoop_home'),
        ('AWS_ACCESS_KEY_ID', 'aws', 'aws_access_key_id'),
        ('AWS_SECRET_ACCESS_KEY', 'aws', 'aws_secret_access_key'),
        ('EC2_KEY_PAIR', 'aws', 'ec2_key_pair'),
        ('EC2_PRIVATE_KEY_FILE', 'aws', 'ec2_private_key_file'),
    )
    for env_var, section, option in env_overrides:
        if env_var not in os.environ:
            continue
        # set() raises NoSectionError on a missing section, so create the
        # section first when the config file omitted it
        if not eggo_config.has_section(section):
            eggo_config.add_section(section)
        eggo_config.set(section, option, os.environ[env_var])

    # Set tmp_data_prefix if not provided in the config file; each module
    # load/invocation will generate a new tmp location in the distributed fs
    if not eggo_config.has_option('paths', 'tmp_data_prefix'):
        if not eggo_config.has_section('paths'):
            eggo_config.add_section('paths')
        eggo_config.set('paths', 'tmp_data_prefix', random_id())

    return eggo_config


# Module-level configuration object; built once, when this module is imported
eggo_config = _init_eggo_config()


# TOAST CONFIGURATION

def validate_toast_config(d):
    """Validate a JSON config file for an eggo dataset (a "toast").

    Placeholder: no checks are performed yet.  ``d`` (presumably the parsed
    toast config; confirm against callers) is ignored and ``None`` is
    returned.
    """
    return None
Loading

0 comments on commit bdb9838

Please sign in to comment.