---
# Variables describing an EMR cluster: name/region, instance groups, EMR
# release, installed applications, Hadoop configuration overrides and the
# steps to run. Several values are read from environment variables at
# template time via Jinja `lookup('env', ...)`.

# Load the cluster name from the environment so that it matches the variable
# used to lookup the provisioned cluster.
name: "{{ lookup('env', 'CLUSTER_NAME') }}"
region: 'us-east-1'

instance_groups:
  master:
    num_instances: 1
    type: m4.large
    market: ON_DEMAND
  core:
    num_instances: 1
    type: m4.large
    market: ON_DEMAND
  task:
    # Count and type of the task group are overridable from the environment.
    # `default(..., true)` also applies the fallback when the looked-up value
    # is an empty string, not only when it is undefined.
    num_instances: "{{ lookup('env', 'TASK_INSTANCE_NUM')|default(2, true)|int }}"
    type: "{{ lookup('env', 'TASK_INSTANCE_TYPE')|default('m4.large', true) }}"
    market: ON_DEMAND

role: EMR_EC2_DefaultRole
keypair_name: 'analytics'  # your analytics instance key pair name
# Placeholder value: must be replaced before use.
vpc_subnet_id: 'subnet-SET ME'  # public subnet of the VPC
log_uri: "{{ lookup('env', 'TASK_CONFIGURATION_S3_BUCKET') }}/logs"
release_label: 'emr-4.7.2'  # use emr-4.9.6 if SigV4 support is required

applications:
  - name: Hadoop
  - name: Hive
  - name: Sqoop-Sandbox
  - name: Ganglia

# Per-classification Hadoop property overrides. Property values are quoted so
# that they stay strings (e.g. '0.20' is not re-read as the float 0.2).
configurations:
  - classification: mapred-site
    properties:
      mapreduce.framework.name: 'yarn'
      mapreduce.jobtracker.retiredjobs.cache.size: '50'
      mapreduce.reduce.shuffle.input.buffer.percent: '0.20'
  - classification: yarn-site
    properties:
      yarn.resourcemanager.max-completed-applications: '5'
  # In EMR 4.7, cross-region S3 access does not work correctly without this
  # property. Should be fixed in EMR 5.
  # See http://stackoverflow.com/questions/38710637/running-emr-example-getting-301-error
  # - classification: core-site
  #   properties:
  #     fs.s3n.endpoint: "s3.amazonaws.com"

# NOTE(review): the reference below had lost its lead-in text; it appears to
# relate to the script-runner JAR used by the steps - confirm.
# See https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-script.html
steps:
  - type: hive_install
    jar: "{{ lookup('env', 'SCRIPT_RUNNER_JAR_S3_URL') }}"
  - type: script
    name: "Install Sqoop"
    jar: "{{ lookup('env', 'SCRIPT_RUNNER_JAR_S3_URL') }}"
    # First argument is the script location in the task configuration bucket;
    # the bucket itself is passed again as the second argument.
    step_args:
      - "{{ lookup('env', 'TASK_CONFIGURATION_S3_BUCKET') }}/install-sqoop"
      - "{{ lookup('env', 'TASK_CONFIGURATION_S3_BUCKET') }}"
    # Set to CANCEL_AND_WAIT when debugging step failures.
    action_on_failure: TERMINATE_JOB_FLOW