diff --git a/Sparking_Water_EMR_instructions.md b/Sparking_Water_EMR_instructions.md
index 1821ee6c..3083feb4 100644
--- a/Sparking_Water_EMR_instructions.md
+++ b/Sparking_Water_EMR_instructions.md
@@ -6,8 +6,7 @@ Run thes
 
     # Log in as Hadoop user and make ec2-user directories
     # If you don't do this step your Spark code will immediately fail with permission issues
-    sudo su
-    su hadoop
+    sudo su hadoop
     hadoop fs -mkdir -p /user/ec2-user
     hadoop fs -chown ec2-user /user/ec2-user
 
@@ -29,4 +28,37 @@ Run thes
     export MASTER="yarn-client"
 
-    # Start up pysparking
-    /home/ec2-user/sparkling-water-1.6.8/bin/pysparkling
\ No newline at end of file
+    # Start up pysparkling
+    /home/ec2-user/sparkling-water-1.6.8/bin/pysparkling --deploy-mode client
+
+    # Note the shell's tracking URL, which will look something like this:
+    http://ip-10-0-0-123.ec2.internal:20888/proxy/application_1477154041215_0004/
+
+    # Open up a new Terminal tab. We're going to do port forwarding / ssh tunneling to view the Spark UI
+    ssh -i /Users/ryanzotti/Documents/private_keys/ML.pem -L 20888:localhost:20888 ec2-user@54.146.60.80
+
+    # Open up your web browser to the tracking URL. Replace the IP with localhost
+    http://localhost:20888/proxy/application_1477154041215_0004/
+
+
+## FAQ
+
+**Question:** You get a never-ending stream of:
+
+    Client: Application report for application_1477154041215_0013 (state: ACCEPTED)
+
+**Answer:** You probably have too many simultaneous (potentially abandoned) spark shells running.
+
+    ps -ef | grep -i spark
+    kill -9 <pid>
+
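+    # kill -9 needs a process id (the second column of the ps output above).
+    # Alternatively, and this is just a sketch assuming the abandoned shells
+    # are still registered with YARN, you can kill them at the YARN level
+    # using the application id from the ACCEPTED messages:
+    yarn application -list
+    yarn application -kill application_1477154041215_0013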
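+
+    # Once the stragglers are gone and pysparkling is restarted, one quick
+    # sanity check (a sketch, assuming your ssh tunnel from above is still
+    # open) is to hit the proxied tracking URL from your laptop:
+    curl -I http://localhost:20888/proxy/application_1477154041215_0004/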