-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathaws-cf_template.yml
270 lines (268 loc) · 11.4 KB
/
aws-cf_template.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
AWSTemplateFormatVersion: '2010-09-09'
Description: Airflow server
Parameters:
  KeyName:
    Description: Name of an existing EC2 KeyPair to enable SSH access into the Airflow web server
    Type: AWS::EC2::KeyPair::KeyName
    ConstraintDescription: Must be the name of an existing EC2 KeyPair
  OutputDBHost:
    Description: REQUIRED - Can be a local path or S3 bucket path (With format "s3a://<bucket-name>" DO NOT END with "/"). This path will be used to read and write the short sale volume dataset.
    Type: String
    # '.+' makes the parameter effectively mandatory (empty string rejected).
    AllowedPattern: '.+'
  QuandlAPIKey:
    Description: REQUIRED - Quandl API Key
    NoEcho: 'true'
    Type: String
    AllowedPattern: '.+'
  AWSAccessKeyID:
    Description: AWS Access Key ID that can access S3 bucket set in "OutputDBHost" and create EMR cluster.
    Type: String
  AWSSecretAccessKey:
    Description: AWS Secret Access Key that can access S3 bucket set in "OutputDBHost" and create EMR cluster.
    NoEcho: 'true'
    Type: String
  AirflowDBPassword:
    Default: airflowpassword
    NoEcho: 'true'
    Description: Airflow database admin account password
    Type: String
    MinLength: '8'
    MaxLength: '41'
    AllowedPattern: '[a-zA-Z0-9]*'
    ConstraintDescription: Must contain only alphanumeric characters
  VPCId:
    Type: AWS::EC2::VPC::Id
    Description: VPC of the EC2 server
  SubnetId:
    Type: AWS::EC2::Subnet::Id
    Description: Subnet of the EC2 server. Must belong in VPC from which you set VPCId from
  InstanceType:
    Type: String
    Default: t3.small
    Description: t2.micro is not enough to run both scheduler and webserver, but it is useful for debugging aws-cf_template.yml
    AllowedValues:
      - t2.micro
      - t3.small
      - t3.medium
  StockLimits:
    Type: String
    # Digits and spaces only; empty means "no limit".
    AllowedPattern: '[ 0-9]*'
    Description: Number of stocks to pull. Set to empty for all stocks
  Stocks:
    Type: String
    # Quoted so YAML does not parse the default as an (empty) flow sequence.
    Default: '[]'
    Description: Pull only the following stocks. Overrides StockLimits. All stocks if empty
  EMRNumCoreNodes:
    Type: Number
    Default: 3
    Description: Number of EMR cores
  EMRCoreNodeInstanceType:
    Type: String
    Default: m3.xlarge
    Description: EMR core instance type
    AllowedValues:
      - m3.xlarge
      - c3.xlarge
  BranchToPull:
    Type: String
    Default: quantopian-only
    Description: Branch from https://github.com/jaycode/short_sale_volume to pull
# Mapping to find the Amazon Linux AMI in each region.
# NOTE(review): AMI IDs go stale over time — verify these still exist in each
# region before deploying.
Mappings:
  RegionMap:
    us-east-1:
      AMI: ami-09d069a04349dc3cb
    us-east-2:
      AMI: ami-0d542ef84ec55d71c
    us-west-1:
      AMI: ami-04bc3da8f14823e88
    us-west-2:
      AMI: ami-01460aa81365561fe
    ap-southeast-1:
      AMI: ami-0d9233e8ce73df7b2
Resources:
  # Single EC2 instance that runs the Airflow webserver + scheduler and a
  # local PostgreSQL metadata database. All bootstrap work happens in UserData.
  EC2Instance:
    Type: AWS::EC2::Instance
    Properties:
      KeyName: !Ref 'KeyName'
      SecurityGroupIds: [!GetAtt AirflowEC2SecurityGroup.GroupId]
      InstanceType: !Ref 'InstanceType'
      SubnetId: !Ref 'SubnetId'
      IamInstanceProfile:
        Ref: EC2InstanceProfile
      Tags:
        -
          Key: Name
          Value: Airflow
      ImageId: !FindInMap
        - RegionMap
        - !Ref 'AWS::Region'
        - AMI
      UserData:
        # !Sub substitutes ${Param} references below; plain $VAR (no braces),
        # e.g. $PATH, is left untouched by Fn::Sub.
        Fn::Base64: !Sub |
          #!/bin/bash
          set -x
          # To debug the outputs, run `cat /var/log/user-data.log`
          exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1
          # Get the latest CloudFormation package
          echo "Installing aws-cfn"
          yum install -y aws-cfn-bootstrap
          # Start cfn-init
          /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region}
          #
          # Configure AWS
          aws configure set aws_access_key_id ${AWSAccessKeyID}
          aws configure set aws_secret_access_key ${AWSSecretAccessKey}
          aws configure set region ${AWS::Region}
          #
          # Install Python3
          sudo yum install -y python36
          #
          # Install PostgreSQL
          sudo yum install -y postgresql postgresql-server postgresql-devel postgresql-contrib postgresql-docs
          sudo service postgresql initdb
          # Edit pg_hba.conf file
          # NOTE(review): the /var/lib/pgsql9 data directory is specific to the
          # Amazon Linux postgresql package used here — confirm it matches the
          # AMI's actual PostgreSQL layout before changing AMIs.
          # Update the line that contains "local all..." by replacing "peer" with "trust"
          sudo sed -i -e '/local all all/ s/peer/trust/' /var/lib/pgsql9/data/pg_hba.conf
          # Update the line that contains "host all...::1/128..." by replacing "ident" with "md5"
          sudo sed -i -e '/host all all \:\:1\/128/ s/ident/md5/' /var/lib/pgsql9/data/pg_hba.conf
          # Delete line that contains '127.0.0.1/32':
          sudo sed -i '/127\.0\.0\.1\/32/d' /var/lib/pgsql9/data/pg_hba.conf
          # Add line under "# IPv4 local connections:"
          sudo sed -i '/\# IPv4 local connections\:/ a host all airflow 0\.0\.0\.0\/0 md5' /var/lib/pgsql9/data/pg_hba.conf
          # Update postgresql.conf to (locally) listen to port 5432
          sudo sed -i -e '/\#listen_addresses/ s/\#l/l/' /var/lib/pgsql9/data/postgresql.conf
          sudo sed -i -e '/\#port \= 5432/ s/\#//' /var/lib/pgsql9/data/postgresql.conf
          # Start PostgreSQL service
          sudo service postgresql start
          # Create user and database for airflow db
          sudo -u postgres psql -c "CREATE USER airflow WITH PASSWORD '${AirflowDBPassword}';"
          sudo -u postgres psql -c "CREATE DATABASE airflowdb OWNER airflow;"
          #
          # Install git
          sudo yum install -y git
          # Download pipeline code
          cd /home/ec2-user
          git clone -b ${BranchToPull} https://github.com/jaycode/short_sale_volume.git
          cd /home/ec2-user/short_sale_volume
          sudo chmod -R 777 /home/ec2-user/short_sale_volume
          # Install boto3
          sudo python3 -m pip install boto3
          # Install airflow using pip
          echo "Install Apache Airflow"
          sudo yum install -y python36-devel
          sudo SLUGIFY_USES_TEXT_UNIDECODE=yes python3 -m pip install -U apache-airflow
          # Airflow installation
          sudo python3 -m pip install apache-airflow[crypto,s3,postgres]
          sudo -H python3 -m pip install six==1.10.0
          sudo python3 -m pip install --upgrade six
          sudo python3 -m pip install markupsafe
          sudo python3 -m pip install --upgrade MarkupSafe
          echo 'export PATH=/usr/local/bin:$PATH' >> /root/.bashrc
          echo 'export AIRFLOW_HOME=/home/ec2-user/short_sale_volume/airflow' >> /root/.bashrc
          source /root/.bashrc
          #
          echo 'export PATH=/usr/local/bin:$PATH' >> /home/ec2-user/.bashrc
          echo 'export AIRFLOW_HOME=/home/ec2-user/short_sale_volume/airflow' >> /home/ec2-user/.bashrc
          #
          # Update configuration files
          cp /home/ec2-user/short_sale_volume/airflow/config.cfg.default /home/ec2-user/short_sale_volume/airflow/config.cfg
          # Use | as delimiter instead of / because OutputDBHost may have forward slashes
          sed -i -e '/DB_HOST=/ s|=.*|=${OutputDBHost}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/API_KEY=/ s|=.*|=${QuandlAPIKey}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/AWS_ACCESS_KEY_ID=/ s|=.*|=${AWSAccessKeyID}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/AWS_SECRET_ACCESS_KEY=/ s|=.*|=${AWSSecretAccessKey}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/REGION_NAME=/ s|=.*|=${AWS::Region}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/STOCK_LIMITS=/ s|=.*|=${StockLimits}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/STOCKS=/ s|=.*|=${Stocks}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/EMR_NUM_CORE_NODES=/ s|=.*|=${EMRNumCoreNodes}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/EMR_CORE_NODE_INSTANCE_TYPE=/ s|=.*|=${EMRCoreNodeInstanceType}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/VPC_ID=/ s|=.*|=${VPCId}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          sed -i -e '/SUBNET_ID=/ s|=.*|=${SubnetId}|' /home/ec2-user/short_sale_volume/airflow/config.cfg
          #
          # Initialize Airflow
          airflow initdb
          #
          # Update the database connection in the Airflow Config file
          sed -i '/sql_alchemy_conn/s/^/#/g' /home/ec2-user/short_sale_volume/airflow/airflow.cfg
          sed -i '/sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${AirflowDBPassword}@127.0.0.1:5432/airflowdb' /home/ec2-user/short_sale_volume/airflow/airflow.cfg
          # Update the type of executor in the Airflow Config file
          sed -i '/executor = SequentialExecutor/s/^/#/g' /home/ec2-user/short_sale_volume/airflow/airflow.cfg
          sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' /home/ec2-user/short_sale_volume/airflow/airflow.cfg
          # Hide examples
          sed -i '/load_examples = True/s/^/#/g' /home/ec2-user/short_sale_volume/airflow/airflow.cfg
          sed -i '/load_examples = True/ a load_examples = False' /home/ec2-user/short_sale_volume/airflow/airflow.cfg
          # Setup webserver log
          mkdir -p /home/ec2-user/log
          touch /home/ec2-user/log/access.log
          chmod 777 /home/ec2-user/log
          chmod 777 /home/ec2-user/log/access.log
          sed -i -e '/access_logfile/ s/=.*/=\/home\/ec2-user\/log\/access\.log/' /home/ec2-user/short_sale_volume/airflow/airflow.cfg
          #
          airflow initdb
          chmod -R 777 /home/ec2-user/short_sale_volume
          # Run Airflow webserver
          airflow webserver -D
          # BUGFIX: path was /home/ec-user/.bashrc (missing "2"), so ec2-user's
          # bashrc was never sourced.
          sudo -u ec2-user -s source /home/ec2-user/.bashrc
          sudo -u ec2-user -s aws configure set aws_access_key_id ${AWSAccessKeyID}
          sudo -u ec2-user -s aws configure set aws_secret_access_key ${AWSSecretAccessKey}
          sudo -u ec2-user -s aws configure set default.region ${AWS::Region}
          airflow scheduler -D
    # cfn-init metadata: the "install" configSet (invoked from UserData above)
    # installs gcc via yum, needed to build Airflow's C extensions.
    Metadata:
      AWS::CloudFormation::Init:
        configSets:
          install:
            - gcc
        gcc:
          packages:
            yum:
              gcc: []
    # Explicit dependency (also implied by the !GetAtt on the security group).
    DependsOn:
      - AirflowEC2SecurityGroup
  # Security group for the Airflow host: HTTP (80), Airflow UI (8080), SSH (22).
  # NOTE(review): SSH and the Airflow UI are open to 0.0.0.0/0 — consider
  # restricting CidrIp to a trusted range.
  AirflowEC2SecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupName: AirflowEC2SG
      GroupDescription: Enable HTTP access via port 80 + SSH access
      VpcId: !Ref 'VPCId'
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: 80
          ToPort: 80
          CidrIp: 0.0.0.0/0
        - IpProtocol: tcp
          FromPort: 8080
          ToPort: 8080
          CidrIp: 0.0.0.0/0
        - IpProtocol: tcp
          FromPort: 22
          ToPort: 22
          CidrIp: 0.0.0.0/0
  # Instance role granting the Airflow host S3 and EMR access.
  EC2Role:
    Type: AWS::IAM::Role
    Properties:
      RoleName: AirflowInstanceRole
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          -
            Effect: "Allow"
            Principal:
              Service:
                - "ec2.amazonaws.com"
            Action:
              - "sts:AssumeRole"
      ManagedPolicyArns:
        - arn:aws:iam::aws:policy/AmazonS3FullAccess
        - arn:aws:iam::aws:policy/AmazonElasticMapReduceFullAccess
  # Instance profile wrapping EC2Role so the instance can assume it.
  EC2InstanceProfile:
    Type: AWS::IAM::InstanceProfile
    Properties:
      InstanceProfileName: AirflowInstanceProfile
      Roles:
        -
          Ref: EC2Role
Outputs:
  AirflowEC2PublicDNSName:
    Description: Public DNS Name of the Airflow EC2 instance
    # URL points at the Airflow webserver's default port (8080).
    Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]]