From a726c660c66c33ac82b735e14881b6d6d72cd2f2 Mon Sep 17 00:00:00 2001
From: <>
Date: Mon, 8 Jul 2024 21:12:01 +0000
Subject: [PATCH] Deployed a089879 with MkDocs version: 1.1.2

---
 ASP2024/08-Mandlebrot/index.html |  3 ++-
 search/search_index.json         |  2 +-
 sitemap.xml                      | 42 +++++++++++++++----------------
 sitemap.xml.gz                   | Bin 365 -> 365 bytes
 4 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/ASP2024/08-Mandlebrot/index.html b/ASP2024/08-Mandlebrot/index.html
index efa347c..c11d281 100644
--- a/ASP2024/08-Mandlebrot/index.html
+++ b/ASP2024/08-Mandlebrot/index.html
@@ -609,7 +609,8 @@

A brief detour through the Mandelbrot set

We have a simple program that can draw the Mandelbrot set. It's called goatbrot.

Downloading the needed executables

Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this:

-$ mkdir bin
+$ source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt
+$ mkdir bin
 $ cd bin
 $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot
 $ chmod +x goatbrot
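
Once goatbrot has been downloaded and made executable, it can later be run through Condor with a submit description file of the kind used throughout these lessons. The sketch below is illustrative only: the Arguments line is a placeholder, not the exercise's actual goatbrot options, and it assumes you submit from the directory containing the bin/ folder you just created.

    # Sketch of a Condor submit file for the downloaded goatbrot binary.
    # The Arguments value is a placeholder; use the options given in this exercise.
    Universe   = vanilla
    Executable = bin/goatbrot
    Arguments  = <goatbrot options for your piece of the Mandelbrot set>
    Log        = goatbrot.log
    Output     = goatbrot.out
    Error      = goatbrot.error
    should_transfer_files   = YES
    when_to_transfer_output = ON_EXIT
    Queue
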
diff --git a/search/search_index.json b/search/search_index.json
index ca2bef3..2186369 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"DOSAR Outreach Projects The Distributed Organization for Scientific and Academic Research (DOSAR) is a 'grass-roots' cyberinfrastructure organization that focuses on community and campus based cyberinfrastructure and promotes a wide range of interdisciplinary and educational activities within the organization and its member institutions. Ongoing Events CODATA-RDA School of Research Data Science - December 2-13, 2019 - San Jose, Costa Rica Upcoming Events CODATA-RDA School of Research Data Science - January 2020, Petoria, South Africa Past Events CODATA-RDA School of Research Data Science - August 5-16, 2019 - Trieste, Italy CODATA-RDA School of Research Data Science - December 3-14, 2018 - Sao Paulo, Brazil CODATA-RDA School of Research Data Science - October 22 - November 2, 2018 - Kigali, Rwanda CODATA-RDA School of Research Data Science - August 6-17, 2018 - Trieste, Italy The African School of Physics (ASP) - June 24-July 14, 2018 - UNAM, Windhoek, Namibia CODATA-RDA School of Research Data Science - December 4-15, 2017 - Sao Paulo, Brazil Polar Hackathon - August 1-4, 2017 - Stony Brook, NY CODATA-RDA School of Research Data Science - July 10-21, 2017 - Trieste, Italy","title":"Home"},{"location":"#dosar-outreach-projects","text":"The Distributed Organization for Scientific and Academic Research (DOSAR) is a 'grass-roots' cyberinfrastructure organization that focuses on community and campus based cyberinfrastructure and promotes a wide range of interdisciplinary and educational activities within the organization and its member institutions.","title":"DOSAR Outreach Projects"},{"location":"#ongoing-events","text":"CODATA-RDA School of Research Data Science - December 2-13, 2019 - San Jose, Costa Rica","title":"Ongoing Events"},{"location":"#upcoming-events","text":"CODATA-RDA School of Research Data Science - January 2020, Petoria, South Africa","title":"Upcoming Events"},{"location":"#past-events","text":"CODATA-RDA School of Research Data Science - August 5-16, 2019 - Trieste, Italy CODATA-RDA School of Research Data Science - December 3-14, 2018 - Sao Paulo, Brazil CODATA-RDA School of Research Data Science - October 22 - November 2, 2018 - Kigali, Rwanda CODATA-RDA School of Research Data Science - August 6-17, 2018 - Trieste, Italy The African School of Physics (ASP) - June 24-July 14, 2018 - UNAM, Windhoek, Namibia CODATA-RDA School of Research Data Science - December 4-15, 2017 - Sao Paulo, Brazil Polar Hackathon - August 1-4, 2017 - Stony Brook, NY CODATA-RDA School of Research Data Science - July 10-21, 2017 - Trieste, Italy","title":"Past Events"},{"location":"ASP2018/","text":"Placeholder for ASP 2018 Lessons","title":"Index"},{"location":"ASP2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2018/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2018/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"ASP2018/01-Introduction/#which-condor","text":"We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"ASP2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"ASP2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. 
If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"ASP2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password previously. If you don't know them, talk to us. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. 
In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) 
To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"ASP2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"ASP2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"ASP2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password previously. If you don't know them, talk to us.","title":"Login to the Condor submit computer"},{"location":"ASP2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"ASP2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? 
You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. 
condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"ASP2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"ASP2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"ASP2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"ASP2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. 
State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"ASP2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"ASP2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include  int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. 
Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . 
This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"ASP2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"ASP2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"ASP2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include  int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"ASP2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"ASP2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"ASP2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"ASP2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"ASP2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"ASP2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"ASP2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"ASP2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"ASP2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"ASP2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"ASP2018/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"ASP2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"ASP2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"ASP2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. 
At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to write a program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"ASP2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"ASP2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to write a program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"ASP2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"ASP2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you a feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed?
There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . 
In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"Running a job with R"},{"location":"ASP2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"ASP2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"ASP2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"ASP2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"ASP2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. 
CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"ASP2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. 
$ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"ASP2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"ASP2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"The answer"},{"location":"ASP2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. 
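Before handing these two files to Condor, it can be worth a quick local test (a minimal sanity check, assuming analyze.sh and gettysburg are both in your current directory; the file names are the ones used above): $ chmod +x analyze.sh $ ./analyze.sh gettysburg $ ls gettysburg.upper gettysburg.10 If both output files appear, the script works on its own, and any later trouble is more likely to be in the submit file than in the program.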
Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"ASP2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"ASP2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"ASP2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. 
The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"ASP2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer_input_files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"ASP2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot . A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to your local machine for viewing (substitute your username in place of YOUR_USER_ID ): $ scp YOUR_USER_ID@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point your browser at the file URL: firefox ./mandle.gif The goatbrot program takes several parameters.
Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbrot 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Copy the file to your public area: cp mandle.gif ~/public 2. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~YOUR_USER_ID","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to your local machine for viewing (substitute your username in place of YOUR_USER_ID ): $ scp YOUR_USER_ID@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point your browser at the file URL: firefox ./mandle.gif The goatbrot program takes several parameters.
Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"ASP2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbrot 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"ASP2018/08-Mandlebrot/#try-it","text":"Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Copy the file to your public area: cp mandle.gif ~/public 2. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~YOUR_USER_ID","title":"Try it!"},{"location":"ASP2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs.
Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is:  06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? 
Why does DAGMan use it?","title":"Challenge"},{"location":"ASP2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"ASP2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual .","title":"What is DAGMan?"},{"location":"ASP2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username.
(press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is:  06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"ASP2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"ASP2018/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"ASP2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash module load imagemagick montage $* Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. 
----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 10 condor_q YOUR_USER_ID -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. 
$ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and the DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~YOUR_USER_ID . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful when deleting: you want to remove the files whose names start with goatbrot.dag. (note the trailing dot), not the goatbrot.dag file itself. $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"ASP2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"ASP2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"ASP2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs are very similar to each other, but they have slightly different parameters (arguments) and output files.
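If you would rather not type four nearly identical submit files by hand, a small shell loop can generate them for you. This is only a convenience sketch (it assumes a bash shell; the centers, tile indices, and executable path are the same ones used in the goatbrot submit files of this exercise), and writing the files manually works just as well:
#!/bin/bash
# Generate goatbrot1.sub .. goatbrot4.sub; each entry is centerx,centery,tile-row,tile-column
i=1
for spec in -0.75,0.75,0,0 0.75,0.75,0,1 -0.75,-0.75,1,0 0.75,-0.75,1,1; do
    IFS=, read -r cx cy ty tx <<< $spec
    cat > goatbrot$i.sub <<EOF
executable = /stash/user/rquick/public/goatbrot-master/goatbrot
arguments = -i 100000 -c $cx,$cy -w 1.5 -s 500,500 -o tile_${ty}_$tx.ppm
log = goatbrot.log
output = goatbrot.out.$ty.$tx
error = goatbrot.err.$ty.$tx
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
    i=$((i+1))
done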
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"ASP2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"ASP2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"ASP2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"ASP2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"ASP2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"ASP2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash module load imagemagick montage $*","title":"wrapper_montage.sh"},{"location":"ASP2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"ASP2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"ASP2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 10 condor_q YOUR_USER_ID -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~YOUR_USER_ID . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"ASP2018/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"ASP2018/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because its exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done, and you would ask DAGMan to run the new rescue DAG. For simplicity, DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG. If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83.
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0
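At this point you can also do a quick sanity check that the montage output from the re-run exists before cleaning up (a small optional step; it assumes you kept the mandle.jpg output name used in montage.sub above):
$ ls -lh mandle.jpg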
Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"ASP2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"ASP2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
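If you want more detail on why the node failed before fixing anything, its own output and error files are the natural place to look (montage.out and montage.err, as named in the submit file); for example:
$ cat montage.err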
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0
Success! Now go ahead and clean up.","title":"Objective"},{"location":"ASP2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"ASP2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"ASP2018/12-VariableSubstitution/#declare-your-variables","text":"First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . 
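For reference, a fully expanded DAG built this way might look like the sketch below. This is just one possible answer (the centers and tile indices are taken from the original goatbrot1.sub through goatbrot4.sub files earlier in these exercises), so double-check it against your own submit files:
JOB g1 goatbrot.sub
JOB g2 goatbrot.sub
JOB g3 goatbrot.sub
JOB g4 goatbrot.sub
JOB montage montage.sub
VARS g1 CENTERX=\"-0.75\"
VARS g1 CENTERY=\"0.75\"
VARS g1 TILEX=\"0\"
VARS g1 TILEY=\"0\"
VARS g2 CENTERX=\"0.75\"
VARS g2 CENTERY=\"0.75\"
VARS g2 TILEX=\"1\"
VARS g2 TILEY=\"0\"
VARS g3 CENTERX=\"-0.75\"
VARS g3 CENTERY=\"-0.75\"
VARS g3 TILEX=\"0\"
VARS g3 TILEY=\"1\"
VARS g4 CENTERX=\"0.75\"
VARS g4 CENTERY=\"-0.75\"
VARS g4 TILEX=\"1\"
VARS g4 TILEY=\"1\"
PARENT g1 g2 g3 g4 CHILD montage
The montage node and its submit file do not change, since the tile file names it expects are the same as before.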
What happens?","title":"Declare your variables"},{"location":"ASP2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"ASP2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"ASP2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"ASP2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. 
For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . 
The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build command. We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with a tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base your image on images not already published by OSG, but if you do this, we recommend that, as one of the steps, you create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image definitions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large number of distributed compute hosts, OSG has chosen to host the images under CVMFS . Any image publicly available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Page sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"ASP2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"ASP2018/14-Containers/#objective","text":"Singularity is a container system that allows users full control over their environment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"ASP2018/14-Containers/#default-image","text":"The default setup is to auto-load an image on sites which support Singularity. Every job which lands on such a site will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites.
The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"ASP2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"ASP2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"ASP2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . 
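As a concrete illustration of the kind of edit you might make, here is a minimal sketch of a Dockerfile. The base image tag and the imagemagick package are only assumptions for the sake of the example, not part of the official instructions:
# hypothetical example: base the image on an OSG-provided Ubuntu Xenial image (assumed tag)
FROM opensciencegrid/osgvo-ubuntu-xenial
# install whatever extra software your jobs need (imagemagick is just an illustration)
RUN apt-get update && apt-get install -y imagemagick && apt-get clean
# required directories, so /cvmfs can be mounted inside the container (see below)
RUN mkdir -p /cvmfs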
Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"ASP2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"ASP2018/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"ASP2018/ASP2018_Materials/","text":"ASP 2018 Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Tuesday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Tuesday Afternoon - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Tuesday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Wednesday Morning - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Wednesday Morning - Computational Infrastructures - Session 5 ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Wednesday Afternoon - Computational Infrastructures - Session 6 Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Dick Greenwood - greenw@phys.latech.edu Chris Walker - walker@nhn.ou.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the African School of Physics 2018. If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"ASP 2018 Material"},{"location":"ASP2018/ASP2018_Materials/#asp-2018-materials","text":"We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information.","title":"ASP 2018 Materials"},{"location":"ASP2018/ASP2018_Materials/#tuesday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Tuesday Morning - Computational Infrastructures - Session 1"},{"location":"ASP2018/ASP2018_Materials/#tuesday-afternoon-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Tuesday Afternoon - Computational Infrastructures - Session 2"},{"location":"ASP2018/ASP2018_Materials/#tuesday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Tuesday Afternoon - Computational Infrastructures - Session 3"},{"location":"ASP2018/ASP2018_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"ASP2018/ASP2018_Materials/#wednesday-morning-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Wednesday Morning - Computational Infrastructures - Session 4"},{"location":"ASP2018/ASP2018_Materials/#wednesday-morning-computational-infrastructures-session-5","text":"ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise","title":"Wednesday Morning - Computational Infrastructures - Session 5"},{"location":"ASP2018/ASP2018_Materials/#wednesday-afternoon-computational-infrastructures-session-6","text":"Computational Infrastructures Wrap Up - Slides","title":"Wednesday Afternoon - Computational Infrastructures - Session 6"},{"location":"ASP2018/ASP2018_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Dick Greenwood - greenw@phys.latech.edu Chris Walker - walker@nhn.ou.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the African School of Physics 2018. 
If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"Contact information"},{"location":"ASP2018/ASP2018_Schedule/","text":"ASP 2018 High-Level Curriculum Overview Tuesday AM Session Welcome and Background The Landscape of Research Computing Tuesday PM Session Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Wednesday AM Session Brief Introduction to clouds and containers ATLAS Analysis Example Wednesday PM Session Close out and resources for further collaboration Detailed Schedule Tuesday 10-July 2018 Time Description Instructor 10:50 Welcome and the Landscape of Research Computing Pat Skubic 11:35 Exercise - UNIX Refresher, Running simple Condor jobs All 12:20 Lunch 13:40 Profiling your application and finding a home for your workflow Julia Gray 14:25 Exercise - Running scripting and R jobs with Condor All 15:10 Coffee Break 15:30 Workflows and distributed environments Horst Severini 16:15 Exercise - DAGMan All Wednesday 11-July 2018 Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 10:50 ATLAS Analysis Example Pat Skubic 11:35 Exercises All 12:20 Lunch 13:40 Closing Thoughts Pat Skubic Materials Materials Page","title":"ASP 2018 Schedule"},{"location":"ASP2018/ASP2018_Schedule/#asp-2018","text":"","title":"ASP 2018"},{"location":"ASP2018/ASP2018_Schedule/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"ASP2018/ASP2018_Schedule/#tuesday-am-session","text":"Welcome and Background The Landscape of Research Computing","title":"Tuesday AM Session"},{"location":"ASP2018/ASP2018_Schedule/#tuesday-pm-session","text":"Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Tuesday PM Session"},{"location":"ASP2018/ASP2018_Schedule/#wednesday-am-session","text":"Brief Introduction to clouds and containers ATLAS Analysis Example","title":"Wednesday AM Session"},{"location":"ASP2018/ASP2018_Schedule/#wednesday-pm-session","text":"Close out and resources for further collaboration","title":"Wednesday PM Session"},{"location":"ASP2018/ASP2018_Schedule/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2018/ASP2018_Schedule/#tuesday-10-july-2018","text":"Time Description Instructor 10:50 Welcome and the Landscape of Research Computing Pat Skubic 11:35 Exercise - UNIX Refresher, Running simple Condor jobs All 12:20 Lunch 13:40 Profiling your application and finding a home for your workflow Julia Gray 14:25 Exercise - Running scripting and R jobs with Condor All 15:10 Coffee Break 15:30 Workflows and distributed environments Horst Severini 16:15 Exercise - DAGMan All","title":"Tuesday 10-July 2018"},{"location":"ASP2018/ASP2018_Schedule/#wednesday-11-july-2018","text":"Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 10:50 ATLAS Analysis Example Pat Skubic 11:35 Exercises All 12:20 Lunch 13:40 Closing Thoughts Pat Skubic","title":"Wednesday 11-July 
2018"},{"location":"ASP2018/ASP2018_Schedule/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2018/AnalysisExample/","text":"ATLAS Analysis Example Introduction Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker. Prerequisite Login on submission node $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command. Simple Analysis Example Step 1: Create simulated data using the grid Now in your test directory on the submission host we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs on your local desktop and then copying the created files to the submission host. Or the nano editor can be run directly on the submission host. A typical copy command would be as follows. $ scp run-root.* YOUR_USER_ID@user-training.osgconnect.net:analysis_example/ It is probably easier to create all scripts with nano on the submission node, though, and then you won't have to copy ( scp ) anything at all. So everything below assumes you are logged on to a terminal session on the submission node. First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue Note that the executable script is: run-root.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. 
You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root You can then inspect the contents of t00.root and t01.root by running Root in your current directory in the local terminal window in which you just ran the wget command: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In Root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q . Step 2: Analyze Real Data Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b -q readEvents.C+ > root-z.out This script runs Root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . 
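For reference, the root -b -q readEvents.C+ line is roughly equivalent to the interactive session sketched below; the trailing + asks ROOT to compile the macro with ACLiC before running it, rather than interpreting it line by line: $ root -b .L readEvents.C+ // compile and load the macro readEvents(); // run it .q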
It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); 
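// (Aside: the commented-out TLorentzVector approach above would give the same kinematics. // After SetPtEtaPhiM for each muon, (Muons1+Muons2).M() would be the pair's invariant mass. // The explicit calculation further below does the same by hand, using px = pT*cos(phi), // py = pT*sin(phi), pz = pT*sinh(eta) and E = sqrt(px^2 + py^2 + pz^2 + m^2) for each muon.)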
// print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them. 
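If you prefer a check that does not involve clicking around in the TBrowser, you can also dump a single histogram straight to an image file from the same local terminal; for example (the output file name here is just an example): $ root -l histograms-z.root zMass->Draw(); // draws the dimuon invariant mass histogram c1->SaveAs(\"zMass.png\"); // c1 is the default canvas ROOT creates for the plot .q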
Step 3: Make TSelector Now let's go back to the files created in step 1, in the remote terminal window. Start root in your test directory with the following commands: $ module load root $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data is these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue Create run-root-2.sh : #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the Root job on the osgconnect training machine by issuing command: root -b < run-root-2.C If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"ATLAS Analysis Example"},{"location":"ASP2018/AnalysisExample/#atlas-analysis-example","text":"","title":"ATLAS Analysis Example"},{"location":"ASP2018/AnalysisExample/#introduction","text":"Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. 
This example is based on a demo developed by OU programmer Chris Walker.","title":"Introduction"},{"location":"ASP2018/AnalysisExample/#prerequisite","text":"Login on submission node $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command.","title":"Prerequisite"},{"location":"ASP2018/AnalysisExample/#simple-analysis-example","text":"","title":"Simple Analysis Example"},{"location":"ASP2018/AnalysisExample/#step-1-create-simulated-data-using-the-grid","text":"Now in your test directory on the submission host we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs on your local desktop and then copying the created files to the submission host. Or the nano editor can be run directly on the submission host. A typical copy command would be as follows. $ scp run-root.* YOUR_USER_ID@user-training.osgconnect.net:analysis_example/ It is probably easier to create all scripts with nano on the submission node, though, and then you won't have to copy ( scp ) anything at all. So everything below assumes you are logged on to a terminal session on the submission node. First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue Note that the executable script is: run-root.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. 
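Keep in mind that condor_q only lists jobs still in the queue; once the job has finished and disappeared from condor_q , you can confirm that it ran with condor_history , for example: $ condor_history YOUR_USER_ID -limit 5 # shows your five most recently completed jobs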
You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root You can then inspect the contents of t00.root and t01.root by running Root in your current directory in the local terminal window in which you just ran the wget command: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In Root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q .","title":"Step 1: Create simulated data using the grid"},{"location":"ASP2018/AnalysisExample/#step-2-analyze-real-data","text":"Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b -q readEvents.C+ > root-z.out This script runs Root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . 
It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); 
// print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. 
Double click on histograms-z.root , and then on the variables to plot them.","title":"Step 2: Analyze Real Data"},{"location":"ASP2018/AnalysisExample/#step-3-make-tselector","text":"Now let's go back to the files created in step 1, in the remote terminal window. Start root in your test directory with the following commands: $ module load root $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data is these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue Create run-root-2.sh : #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the Root job on the osgconnect training machine by issuing command: root -b < run-root-2.C If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . 
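As a cross-check before submitting, the additions you made to the generated selector, taken together, should look roughly like the sketch below (the generated files contain additional boilerplate that is omitted here, and the header pulled in by the added include statement is presumably TH1F.h): // s0.h -- added pieces #include \"TH1F.h\" class s0 : public TSelector { public : TH1F *e; // histogram of the Energy branch // ... generated members and methods ... }; // s0.C -- added pieces void s0::SlaveBegin(TTree * /*tree*/) { e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); // ... } Bool_t s0::Process(Long64_t entry) { GetEntry(entry); // fills the branch variables, including Energy e->Fill(Energy); // ... return kTRUE; } void s0::Terminate() { TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); }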
This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"Step 3: Make TSelector"},{"location":"ASP2018/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon 2017"},{"location":"ASP2018/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"ASP2018/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2018/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"ASP2018/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"ASP2018/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"ASP2018/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2018/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon School Materials"},{"location":"ASP2018/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"ASP2018/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"ASP2018/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"ASP2018/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"ASP2018/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"ASP2018/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"ASP2018/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"ASP2018/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"ASP2018/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"ASP2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing 
Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"ASP2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"ASP2018/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"ASP2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2018/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"ASP2018/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"ASP2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2022/","text":"Placeholder for ASP 2018 Lessons","title":"Index"},{"location":"ASP2022/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 9.12.0, which is a recent version of Condor. Condor has two coexisting types of releases at any given time: Feature (development) and Long Term Support (stable). Condor 9.12.0 is considered a stable release, while 9.13.1 is considered a development release. You can know 9.12.0 is stable because the second digit (a 12 in this case) is an even number, while in the development version 9.13.1 it is an odd number (13 in this case). In a given stable series, all versions have the same features (for example 9.10.16 and 9.10.17 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'DOSAR/ASP December 2022'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. 
You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2022/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2022/01-Introduction/#preliminaries","text":"You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"ASP2022/01-Introduction/#which-condor","text":"We will be using Condor 9.12.0, which is a recent version of Condor. Condor has two coexisting types of releases at any given time: Feature (development) and Long Term Support (stable). Condor 9.12.0 is considered a stable release, while 9.13.1 is considered a development release. You can know 9.12.0 is stable because the second digit (a 12 in this case) is an even number, while in the development version 9.13.1 it is an odd number (13 in this case). In a given stable series, all versions have the same features (for example 9.10.16 and 9.10.17 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"ASP2022/01-Introduction/#where-you-will-work","text":"Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'DOSAR/ASP December 2022'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"ASP2022/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. 
For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"The Exercises"},{"location":"ASP2022/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 9.12.0 2022-10-05 BuildID: 608474 PackageID: 9.12.0-1.1 $ $CondorPlatform: X86_64-Ubuntu_20.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 20.04, but you might notice that we're running on Ubuntu 20.04.5, which is a slightly newer version. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 20.04.5 LTS \\n \\l Or you can run: $ lsb_release -a o No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 20.04.5 LTS Release: 20.04 Codename: focal Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. 
It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) 
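(A hint for the condor_q bonus questions above, using standard condor_q options; treat these as sketches to adapt rather than required commands:
$ condor_q -constraint 'JobStatus == 2' -nobatch
shows only jobs whose ClassAd reports them as running, since a JobStatus of 2 means running and 1 means idle, and
$ condor_q -format '%d.' ClusterId -format '%d\n' ProcId
prints just the bare job IDs, one per line.)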
To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"ASP2022/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"ASP2022/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"ASP2022/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom","title":"Login to the Condor submit computer"},{"location":"ASP2022/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 9.12.0 2022-10-05 BuildID: 608474 PackageID: 9.12.0-1.1 $ $CondorPlatform: X86_64-Ubuntu_20.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 20.04, but you might notice that we're running on Ubuntu 20.04.5, which is a slightly newer version.","title":"Looking at our Condor installation"},{"location":"ASP2022/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 20.04.5 LTS \\n \\l Or you can run: $ lsb_release -a o No LSB modules are available. 
Distributor ID: Ubuntu Description: Ubuntu 20.04.5 LTS Release: 20.04 Codename: focal Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) 
You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"ASP2022/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"ASP2022/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"ASP2022/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"ASP2022/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. 
Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"ASP2022/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"ASP2022/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . 
This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"ASP2022/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"ASP2022/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"ASP2022/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"ASP2022/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"ASP2022/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"ASP2022/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"ASP2022/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"ASP2022/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"ASP2022/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"ASP2022/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
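Before the challenges, for reference: here is one way the "On your own" items above could fit together. This is a sketch, not the only correct answer; it names the output files with both $(Cluster) and $(Process) and queues ten processes in a single statement:
Universe = vanilla
Executable = simple
Arguments = $(Process) $(Cluster)
Log = simple.$(Cluster).$(Process).log
Output = simple.$(Cluster).$(Process).out
Error = simple.$(Cluster).$(Process).error
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
Queue 10
With this file, each of the ten jobs sleeps for its process number (0 through 9 seconds) and doubles the cluster number, and because the cluster number changes with every submission, re-submitting does not overwrite earlier results.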
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"ASP2022/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"ASP2022/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"ASP2022/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"ASP2022/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"ASP2022/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"ASP2022/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"ASP2022/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. 
At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"ASP2022/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"ASP2022/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"ASP2022/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"ASP2022/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. 
These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. 
Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"Running a job with R"},{"location":"ASP2022/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"ASP2022/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"ASP2022/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"ASP2022/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"ASP2022/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) 
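(Returning briefly to the answer submit file shown above: the round trip is the same one you have used before. A minimal sketch, assuming you saved that answer in a file named submit.r, which is an arbitrary name chosen here for illustration:)

# Submit the R job and, once it has left the queue, look at its output.
condor_submit submit.r
condor_q            # repeat until the job disappears from the queue
cat R.out.*         # the output should end with 52 factorial, roughly 8.07e67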
We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"ASP2022/06-RJob/#setup","text":"You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"ASP2022/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"ASP2022/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"The answer"},{"location":"ASP2022/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . 
Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"ASP2022/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"ASP2022/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"ASP2022/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. 
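(One parenthetical sketch for the "On your own" question above about URLs: Condor's file transfer can fetch HTTP inputs directly, provided the pool's URL transfer plugins are available. The URL below is a placeholder, not a real address, and the downloaded file should appear in the job sandbox under the last component of the URL, which is why the Arguments line names speech.txt.)

#!/bin/sh
# Sketch only: a variant of submit.speech whose input is fetched over HTTP
# instead of being transferred from your submit directory.
cat > submit.speech.url <<'EOF'
Universe = vanilla
Executable = analyze.sh
Output = analyze.url.out
Error = analyze.url.error
Log = analyze.url.log
Arguments = speech.txt
ShouldTransferFiles = Yes
WhenToTransferOutput = ON_EXIT
transfer_input_files = http://example.com/speech.txt
queue
EOF
condor_submit submit.speech.url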
First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"ASP2022/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"ASP2022/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot . Downloading the needed executables Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd .. A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point Browser at the file URL: open a Terminal on your local CentOS VM, and then type in there: firefox Downloads/mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run each goatbroat in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbroat 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! 
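(In case you are wondering where the -c and -w values for the four tiles come from: the full picture was centered at (0,0) with width 3, so a 2 by 2 split gives each tile a width of 1.5 and centers at plus or minus 0.75. The short sketch below just prints that arithmetic for an n by n split; the variable names and the loop are purely illustrative.)

#!/bin/sh
# Illustrative only: print goatbrot tile parameters for an n x n split of the
# region centered at (0,0) with total width 3, matching the hand-run example.
N=2; W=3
awk -v n="$N" -v w="$W" 'BEGIN {
  tw = w / n                          # width of each tile (1.5 here)
  for (row = 0; row < n; row++)
    for (col = 0; col < n; col++) {
      cx = -w/2 + tw/2 + col*tw       # tile centers: -0.75 and 0.75
      cy =  w/2 - tw/2 - row*tw
      printf "tile_%d_%d: -c %g,%g -w %g\n", row, col, cx, cy, tw
    }
}'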
Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2022/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2022/08-Mandlebrot/#downloading-the-needed-executables","text":"Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd ..","title":"Downloading the needed executables"},{"location":"ASP2022/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point Browser at the file URL: open a Terminal on your local CentOS VM, and then type in there: firefox Downloads/mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"ASP2022/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run each goatbroat in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) 
Run goatbrot 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"ASP2022/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"Try it!"},{"location":"ASP2022/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag .
JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
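# The long block that follows is just DAGMan echoing its DAGMAN_* configuration settings and its command-line arguments; the interesting events begin at "Submitting Condor Node Simple job(s)".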
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
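If you want to double-check that claim yourself after the DAG finishes, the job's ClassAd records its universe. A minimal sketch, assuming 61 was your DAGMan cluster id as in the run above (substitute your own id):

# JobUniverse = 7 means the scheduler universe (5 would be vanilla).
condor_history -l 61 | grep '^JobUniverse'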
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2022/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2022/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"ASP2022/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"ASP2022/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. 
In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"ASP2022/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"ASP2022/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"ASP2022/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . 
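(Before building the submit files below, it is worth one quick check that the executable from that detour is still in place; /home/jovyan/bin/goatbrot is the path the submit files below assume, jovyan being the training-VM account name used there.)

# One-line sanity check before writing the five submit files.
test -x /home/jovyan/bin/goatbrot && echo "goatbrot is ready" || echo "goatbrot missing - redo the download step"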
We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory. goatbrot1.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . 
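(A side note on the five submit files above: the four goatbrot ones differ only in the tile center and in the row/column suffixes, so if you would rather not type them by hand, a small generator along these lines produces the same four files. This is only a convenience sketch; the values simply mirror the listings above.)

#!/bin/sh
# Illustrative generator for goatbrot1.sub .. goatbrot4.sub as described above.
n=1
for spec in "-0.75,0.75 0 0" "0.75,0.75 0 1" "-0.75,-0.75 1 0" "0.75,-0.75 1 1"; do
  set -- $spec        # $1 = tile center, $2 = row, $3 = column
  cat > goatbrot$n.sub <<EOF
executable = /home/jovyan/bin/goatbrot
arguments = -i 100000 -c ${1} -w 1.5 -s 500,500 -o tile_${2}_${3}.ppm
log = goatbrot.log
output = goatbrot.out.${2}.${3}
error = goatbrot.err.${2}.${3}
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
  n=$((n+1))
done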
Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? 
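One convenient way to see that progress without reading the whole file is to pull out just the node-status tables and the final summary. This is only a convenience sketch using the file name from this exercise:

# Show each node-status table DAGMan wrote, plus the final summary line.
grep -A 3 'nodes total' goatbrot.dag.dagman.out
grep 'All jobs Completed' goatbrot.dag.dagman.out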
Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"ASP2022/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"ASP2022/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"ASP2022/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory.","title":"Make your job submission files"},{"location":"ASP2022/10-ComplexDAG/#goatbrot1sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"ASP2022/10-ComplexDAG/#goatbrot2sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"ASP2022/10-ComplexDAG/#goatbrot3sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"ASP2022/10-ComplexDAG/#goatbrot4sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"ASP2022/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. 
universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"ASP2022/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"ASP2022/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"ASP2022/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! 
(All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"ASP2022/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"ASP2022/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. 
DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... 
Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Handling a DAG that fails"},{"location":"ASP2022/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"ASP2022/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. 
DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 
06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"ASP2022/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2022/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2022/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"ASP2022/12-VariableSubstitution/#declare-your-variables","text":"First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . 
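For reference, here is one way the completed DAG might look once every node uses the shared submit file (a sketch only; the center coordinates and tile indices are copied from the per-job submit files used earlier):
JOB g1 goatbrot.sub
VARS g1 CENTERX=\"-0.75\"
VARS g1 CENTERY=\"0.75\"
VARS g1 TILEX=\"0\"
VARS g1 TILEY=\"0\"
JOB g2 goatbrot.sub
VARS g2 CENTERX=\"0.75\"
VARS g2 CENTERY=\"0.75\"
VARS g2 TILEX=\"1\"
VARS g2 TILEY=\"0\"
JOB g3 goatbrot.sub
VARS g3 CENTERX=\"-0.75\"
VARS g3 CENTERY=\"-0.75\"
VARS g3 TILEX=\"0\"
VARS g3 TILEY=\"1\"
JOB g4 goatbrot.sub
VARS g4 CENTERX=\"0.75\"
VARS g4 CENTERY=\"-0.75\"
VARS g4 TILEX=\"1\"
VARS g4 TILEY=\"1\"
JOB montage montage.sub
PARENT g1 g2 g3 g4 CHILD montage
If your DAG looks roughly like this, re-submit it with condor_submit_dag goatbrot.dag as before.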
What happens?","title":"Declare your variables"},{"location":"ASP2022/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"ASP2022/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"ASP2022/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"ASP2022/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. 
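If you are curious which Singularity version is available on the host you are logged into (assuming the client is installed there at all), a quick check is:
$ singularity --version
On hosts without the client this will simply report that the command is not found.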
For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . 
The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"ASP2022/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"ASP2022/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"ASP2022/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. 
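Since most users will never notice the container, you may want a way to verify it from inside a job. A tiny test script along these lines should work (check-container.sh is a hypothetical name, and SINGULARITY_CONTAINER is the variable recent Singularity versions set inside a container, so treat the exact name as an assumption):
#!/bin/bash
# check-container.sh: report whether this job is running inside a Singularity image
if [ -n \"$SINGULARITY_CONTAINER\" ]; then
    echo \"Running inside Singularity image: $SINGULARITY_CONTAINER\"
else
    echo \"Not running inside a Singularity container\"
fi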
The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"ASP2022/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"ASP2022/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"ASP2022/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . 
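To give a feel for what that involves, a stripped-down Dockerfile might be as simple as the sketch below (the base image name mirrors the CVMFS path listed above but is still an assumption, and python3 is only a placeholder for whatever software your job actually needs):
# start from the OSG-provided Ubuntu Xenial image (assumed Docker Hub name)
FROM opensciencegrid/osgvo-ubuntu-xenial:latest
# required directories, so /cvmfs can be bound inside the container
RUN mkdir -p /cvmfs
# add your own packages here; python3 is just an example
RUN apt-get update && apt-get install -y --no-install-recommends python3 && rm -rf /var/lib/apt/lists/*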
Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"ASP2022/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"ASP2022/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"ASP2022/ASP2022_Materials/","text":"ASP 2022 Materials We will be using OSG Training Pool for this set of sessions. Please visit https://notebook.ospool.osg-htc.org/hub/login to log in. Wednesday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday Afternoon - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Wednesday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Morning - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Thursday Morning - Computational Infrastructures - Session 5 ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"ASP 2022 Materials"},{"location":"ASP2022/ASP2022_Materials/#asp-2022-materials","text":"We will be using OSG Training Pool for this set of sessions. 
Please visit https://notebook.ospool.osg-htc.org/hub/login to log in.","title":"ASP 2022 Materials"},{"location":"ASP2022/ASP2022_Materials/#wednesday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday Morning - Computational Infrastructures - Session 1"},{"location":"ASP2022/ASP2022_Materials/#wednesday-afternoon-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday Afternoon - Computational Infrastructures - Session 2"},{"location":"ASP2022/ASP2022_Materials/#wednesday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Wednesday Afternoon - Computational Infrastructures - Session 3"},{"location":"ASP2022/ASP2022_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"ASP2022/ASP2022_Materials/#thursday-morning-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Morning - Computational Infrastructures - Session 4"},{"location":"ASP2022/ASP2022_Materials/#thursday-morning-computational-infrastructures-session-5","text":"ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides","title":"Thursday Morning - Computational Infrastructures - Session 5"},{"location":"ASP2022/ASP2022_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"Contact information"},{"location":"ASP2022/ASP2022_Schedule/","text":"ASP 2022 High-Level Curriculum Overview Wednesday AM Session Welcome and Background The Landscape of Research Computing Wednesday PM Session Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Thursday AM Session Brief Introduction to clouds and containers ATLAS Analysis Example Close out and resources for further collaboration Detailed Schedule Wednesday 7-December 2022 Time Description Instructor 11:00 Welcome and the Landscape of Research Computing Jae Yu 11:45 Exercise - UNIX Refresher, Running simple Condor jobs All 12:30 Lunch 14:00 Profiling your application and finding a home for your workflow Pat Skubic 14:45 Exercise - Running scripting and R jobs with Condor All 15:30 Coffee Break 16:00 Workflows and distributed environments Horst Severini 16:45 Exercise - DAGMan All Thursday 8-December 2022 Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 11:00 ATLAS Analysis Example and Exercises Pat Skubic 11:45 Closing Thoughts All 12:30 Lunch Materials Materials Page","title":"ASP 2022 Schedule"},{"location":"ASP2022/ASP2022_Schedule/#asp-2022","text":"","title":"ASP 
2022"},{"location":"ASP2022/ASP2022_Schedule/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"ASP2022/ASP2022_Schedule/#wednesday-am-session","text":"Welcome and Background The Landscape of Research Computing","title":"Wednesday AM Session"},{"location":"ASP2022/ASP2022_Schedule/#wednesday-pm-session","text":"Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Wednesday PM Session"},{"location":"ASP2022/ASP2022_Schedule/#thursday-am-session","text":"Brief Introduction to clouds and containers ATLAS Analysis Example Close out and resources for further collaboration","title":"Thursday AM Session"},{"location":"ASP2022/ASP2022_Schedule/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2022/ASP2022_Schedule/#wednesday-7-december-2022","text":"Time Description Instructor 11:00 Welcome and the Landscape of Research Computing Jae Yu 11:45 Exercise - UNIX Refresher, Running simple Condor jobs All 12:30 Lunch 14:00 Profiling your application and finding a home for your workflow Pat Skubic 14:45 Exercise - Running scripting and R jobs with Condor All 15:30 Coffee Break 16:00 Workflows and distributed environments Horst Severini 16:45 Exercise - DAGMan All","title":"Wednesday 7-December 2022"},{"location":"ASP2022/ASP2022_Schedule/#thursday-8-december-2022","text":"Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 11:00 ATLAS Analysis Example and Exercises Pat Skubic 11:45 Closing Thoughts All 12:30 Lunch","title":"Thursday 8-December 2022"},{"location":"ASP2022/ASP2022_Schedule/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2022/AnalysisExample/","text":"ATLAS Analysis Example Introduction Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker. Prerequisite Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command. Simple Analysis Example Step 1: Create simulated data using the grid Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. ----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. 
It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q . Step 2: Analyze Real Data We will not submit grid jobs during this exercise. So we will skip submit script. ----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. 
You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", 
\";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. 
----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them. Step 3: Make TSelector Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
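Taken together, the additions above should leave the selector looking roughly like the sketch below. Only the added lines are shown in full; the include that was cut off above is presumably the TH1F header, and everything else generated by MakeSelector stays as it is:
// s0.h -- after the existing include statements (assumed header for TH1F):
#include \"TH1F.h\"

// s0.h -- inside the class s0 definition, after public :
TH1F *e;

// s0.C -- the three methods after the additions:
void s0::SlaveBegin(TTree * /*tree*/)
{
   e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0);   // book the Energy histogram
   // ... generated code unchanged ...
}

Bool_t s0::Process(Long64_t entry)
{
   GetEntry(entry);      // read the current entry
   e->Fill(Energy);      // fill the histogram with the Energy branch
   // ... generated code unchanged ...
   return kTRUE;
}

void s0::Terminate()
{
   TFile f(\"histograms.root\",\"RECREATE\");
   f.WriteObject(e,\"Energy\");   // save the histogram to histograms.root
   f.Close();
   // ... generated code unchanged ...
}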
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"ATLAS Analysis Example"},{"location":"ASP2022/AnalysisExample/#atlas-analysis-example","text":"","title":"ATLAS Analysis Example"},{"location":"ASP2022/AnalysisExample/#introduction","text":"Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker.","title":"Introduction"},{"location":"ASP2022/AnalysisExample/#prerequisite","text":"Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command.","title":"Prerequisite"},{"location":"ASP2022/AnalysisExample/#simple-analysis-example","text":"","title":"Simple Analysis Example"},{"location":"ASP2022/AnalysisExample/#step-1-create-simulated-data-using-the-grid","text":"Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. 
This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. ----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . 
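If you prefer typing commands to clicking, you can also histogram a branch directly at the same Root prompt instead of using the TBrowser; a minimal sketch, in the same session with t00.root already open:
t0->Draw(\"Energy\");   // histogram the Energy branch of the TTree t0 in t00.root
This draws the same distribution you would otherwise get by double clicking on Energy in the TBrowser.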
You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q .","title":"Step 1: Create simulated data using the grid"},{"location":"ASP2022/AnalysisExample/#step-2-analyze-real-data","text":"We will not submit grid jobs during this exercise. So we will skip submit script. ----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} 
[GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // 
draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. ----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them.","title":"Step 2: Analyze Real Data"},{"location":"ASP2022/AnalysisExample/#step-3-make-tselector","text":"Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"Step 3: Make TSelector"},{"location":"ASP2022/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon 2017"},{"location":"ASP2022/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"ASP2022/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2022/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"ASP2022/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"ASP2022/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"ASP2022/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2022/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. 
Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon School Materials"},{"location":"ASP2022/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"ASP2022/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"ASP2022/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"ASP2022/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"ASP2022/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"ASP2022/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"ASP2022/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"ASP2022/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"ASP2022/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"ASP2022/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2022/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"ASP2022/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"ASP2022/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"ASP2022/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2022/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"ASP2022/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources 
Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"ASP2022/School/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2024/","text":"Placeholder for ASP 2024 Lessons","title":"Index"},{"location":"ASP2024/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 23.7.2, which is a recent version of Condor. Where you will work Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'Data Science'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2024/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2024/01-Introduction/#preliminaries","text":"You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"ASP2024/01-Introduction/#which-condor","text":"We will be using Condor 23.7.2, which is a recent version of Condor.","title":"Which Condor?"},{"location":"ASP2024/01-Introduction/#where-you-will-work","text":"Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'Data Science'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. 
You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"ASP2024/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"The Exercises"},{"location":"ASP2024/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 23.7.2 2024-05-16 BuildID: 733409 PackageID: 23.7.2-0.2 GitSHA: 585ec167 $ $CondorPlatform: X86_64-Ubuntu_22.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 22.04, but you might notice that we're running on Ubuntu 22.04.4, which is a slightly newer version. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 22.04.4 LTS \\n \\l Or you can run: $ lsb_release -a No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 22.04.4 LTS Release: 22.04 Codename: jammy Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_config.local config.d ganglia.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /home/jovyan/.condor/local # at: /etc/condor/condor_config.local, line 2 # raw: LOCAL_DIR = $ENV(HOME)/.condor/local $ ls -CF /home/jovyan/.condor/local/ cred_dir/ execute/ lock/ log/ run/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep jovyan 17 0.0 0.0 23844 7240 ? Ss 19:32 0:00 condor_master jovyan 18 0.0 0.0 7620 2372 ? S 19:32 0:00 \\_ condor_procd -A /home/jovyan/.condor/local/run/procd_pipe -L /home/jovyan/ jovyan 19 0.0 0.0 18200 8284 ? Ss 19:32 0:00 \\_ condor_shared_port jovyan 20 0.0 0.0 20180 9640 ? Ss 19:32 0:00 \\_ condor_collector jovyan 21 0.0 0.0 20688 10028 ? Ss 19:32 0:00 \\_ condor_negotiator jovyan 22 0.0 0.0 21320 10104 ?
Ss 19:32 0:00 \\_ condor_schedd jovyan 23 0.0 0.0 21136 10172 ? Ss 19:32 0:00 \\_ condor_startd For this version of Condor there are these processes running: the condor_master, the condor_schedd, the condor_procd, the condor_collector, the condor_negotiator, and condor_shared_port. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: jovyan@jupyter-email-3ahorst-2eseverini-40gmail-2ecom : <127.0.0.1:9618?... @ 07/02/24 19:44:46 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for jovyan: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) 
condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@jupyter-email-3ahorst-2eseverini-40gmail-2ecom LINUX X86_64 Unclaimed Idle 0.000 257750 0+00:14:02 Total Owner Claimed Unclaimed Matched Preempting Drain Backfill BkIdle X86_64/LINUX 1 0 0 1 0 0 0 0 0 Total 1 0 0 1 0 0 0 0 0 ... Let's look at exactly what you can see (this will look differently on different condor pools): Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"ASP2024/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"ASP2024/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"ASP2024/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom","title":"Login to the Condor submit computer"},{"location":"ASP2024/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 23.7.2 2024-05-16 BuildID: 733409 PackageID: 23.7.2-0.2 GitSHA: 585ec167 $ $CondorPlatform: X86_64-Ubuntu_22.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 22.04, but you might notice that we're running on Ubuntu 22.04.4, which is a slightly newer version.","title":"Looking at our Condor installation"},{"location":"ASP2024/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? 
You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 22.04.4 LTS \\n \\l Or you can run: $ lsb_release -a o No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 22.04.4 LTS Release: 22.04 Codename: jammy Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_config.local config.d ganglia.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /home/jovyan/.condor/local # at: /etc/condor/condor_config.local, line 2 # raw: LOCAL_DIR = $ENV(HOME)/.condor/local $ s -CF /home/jovyan/.condor/local/ cred_dir/ execute/ lock/ log/ run/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep jovyan 17 0.0 0.0 23844 7240 ? Ss 19:32 0:00 condor_master jovyan 18 0.0 0.0 7620 2372 ? S 19:32 0:00 \\_ condor_procd -A /home/jovyan/.condor/local/run/procd_pipe -L /home/jovyan/ jovyan 19 0.0 0.0 18200 8284 ? Ss 19:32 0:00 \\_ condor_shared_port jovyan 20 0.0 0.0 20180 9640 ? Ss 19:32 0:00 \\_ condor_collector jovyan 21 0.0 0.0 20688 10028 ? Ss 19:32 0:00 \\_ condor_negotiator jovyan 22 0.0 0.0 21320 10104 ? Ss 19:32 0:00 \\_ condor_schedd jovyan 23 0.0 0.0 21136 10172 ? Ss 19:32 0:00 \\_ condor_startd For this version of Condor there are these processes running: the condor_master, the condor_schedd, the condor_procd, the condor_collector, the condor_negotiator, and condor_shared_port. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. 
It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"ASP2024/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: jovyan@jupyter-email-3ahorst-2eseverini-40gmail-2ecom : <127.0.0.1:9618?... @ 07/02/24 19:44:46 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for jovyan: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"ASP2024/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"ASP2024/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"ASP2024/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@jupyter-email-3ahorst-2eseverini-40gmail-2ecom LINUX X86_64 Unclaimed Idle 0.000 257750 0+00:14:02 Total Owner Claimed Unclaimed Matched Preempting Drain Backfill BkIdle X86_64/LINUX 1 0 0 1 0 0 0 0 0 Total 1 0 0 1 0 0 0 0 0 ... Let's look at exactly what you can see (this will look differently on different condor pools): Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. 
Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"ASP2024/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"ASP2024/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. 
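Before you write the submit file, it can be worth one last sanity check that the executable is where Condor will look for it; assuming you kept the directory layout above, something like: $ ls -lh ~/condor-test/simple $ ~/condor-test/simple 1 2 should list an executable file and then print the doubled value after a one-second pause. With that confirmed, you can describe the job to Condor.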
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
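If you would rather not re-run condor_q by hand while the sweep moves through the queue, and the watch utility happens to be installed on your submit machine, an optional shortcut is: $ watch -n 5 condor_q -nobatch which refreshes the listing every five seconds (press CTRL-c to stop). The repeated condor_q calls below show the same progression done manually.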
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . 
This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"ASP2024/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"ASP2024/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"ASP2024/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"ASP2024/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. 
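One optional extra, once you have written the submit file described below: if the condor_submit on your machine supports the -dry-run option, you can preview exactly what would be handed to Condor without queueing anything, for example: $ condor_submit -dry-run job.ad submit where job.ad is just an arbitrary file name into which the generated job ClassAd is written.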
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"ASP2024/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
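Because all three jobs share cluster number 34, you can also limit the listing to that cluster by giving condor_q the cluster number, for example: $ condor_q 34 -nobatch which is convenient when you have other, unrelated jobs in the queue at the same time.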
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"ASP2024/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"ASP2024/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"ASP2024/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"ASP2024/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"ASP2024/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"ASP2024/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"ASP2024/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"ASP2024/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"ASP2024/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"ASP2024/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"ASP2024/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"ASP2024/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. 
At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"ASP2024/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"ASP2024/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"ASP2024/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"ASP2024/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. 
These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. 
Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"Running a job with R"},{"location":"ASP2024/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"ASP2024/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"ASP2024/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"ASP2024/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"ASP2024/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) 
We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"ASP2024/06-RJob/#setup","text":"You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"ASP2024/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"ASP2024/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"The answer"},{"location":"ASP2024/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . 
Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"ASP2024/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"ASP2024/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"ASP2024/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. 
First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"ASP2024/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"ASP2024/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot . Downloading the needed executables Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd .. A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point your browser at the file URL: open a Terminal on your local CentOS VM, and then type in there: firefox Downloads/mandle.gif The goatbrot program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbrot 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! 
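(Before you run them, it may help to see where those numbers come from: the single big image above was centered at 0,0 with -w 3, so each tile in the 2 by 2 grid covers half of that width, 3 / 2 = 1.5, and each tile's center sits a quarter of the full width, 3 / 4 = 0.75, away from the origin in x and y. That is why every command uses -w 1.5 and a center of plus or minus 0.75, and why the tiles are 500 by 500 pixels instead of 1000 by 1000. This is only arithmetic on the parameters you have already seen, not anything new you need to pass to goatbrot.) 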
Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2024/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2024/08-Mandlebrot/#downloading-the-needed-executables","text":"Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd ..","title":"Downloading the needed executables"},{"location":"ASP2024/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point your browser at the file URL: open a Terminal on your local CentOS VM, and then type in there: firefox Downloads/mandle.gif The goatbrot program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"ASP2024/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) 
Run goatbrot 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"ASP2024/08-Mandlebrot/#try-it","text":"Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"Try it!"},{"location":"ASP2024/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep, but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . 
JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGMan run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2024/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2024/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"ASP2024/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep, but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual .","title":"What is DAGMan?"},{"location":"ASP2024/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. 
In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"ASP2024/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"ASP2024/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"ASP2024/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . 
We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory. goatbrot1.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . 
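(A side note: condor_q also has a -dag option; you will try it in the On your own section at the end of this exercise and see how it changes the display of DAG node jobs.) 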
Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? 
Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful when deleting the goatbrot.dag.* files: you do not want to delete the goatbrot.dag file itself, just the goatbrot.dag.* files. $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"ASP2024/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"ASP2024/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"ASP2024/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs are very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory.","title":"Make your job submission files"},{"location":"ASP2024/10-ComplexDAG/#goatbrot1sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"ASP2024/10-ComplexDAG/#goatbrot2sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"ASP2024/10-ComplexDAG/#goatbrot3sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"ASP2024/10-ComplexDAG/#goatbrot4sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"ASP2024/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. 
universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"ASP2024/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"ASP2024/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"ASP2024/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! 
(All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"ASP2024/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"ASP2024/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. 
DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... 
Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Handling a DAG that fails"},{"location":"ASP2024/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"ASP2024/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. 
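For reference, the DAG being resubmitted in this exercise has four goatbrot nodes feeding a single montage node, which is exactly the structure the rescue DAG above records. A minimal sketch of what goatbrot.dag might contain is shown here; the node names come from the rescue DAG, but the per-node submit file names are assumptions carried over from the earlier DAG exercise, so use whatever names you chose there:

JOB g1 goatbrot1.sub
JOB g2 goatbrot2.sub
JOB g3 goatbrot3.sub
JOB g4 goatbrot4.sub
JOB montage montage.sub
PARENT g1 g2 g3 g4 CHILD montage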
DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 
06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"ASP2024/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2024/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2024/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"ASP2024/12-VariableSubstitution/#declare-your-variables","text":"First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . 
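For completeness, here is a sketch of what the whole DAG could look like once every goatbrot node uses VARS. Only the g1 values appear in the example above; the centre coordinates and tile indices for g2, g3 and g4 are assumptions based on the 2x2 tiling from the earlier goatbrot exercise, so substitute the values you actually used. The montage node and its submit file are assumed unchanged from the previous lessons.

JOB g1 goatbrot.sub
VARS g1 CENTERX="-0.75"
VARS g1 CENTERY="0.75"
VARS g1 TILEX="0"
VARS g1 TILEY="0"
JOB g2 goatbrot.sub
VARS g2 CENTERX="0.75"
VARS g2 CENTERY="0.75"
VARS g2 TILEX="1"
VARS g2 TILEY="0"
JOB g3 goatbrot.sub
VARS g3 CENTERX="-0.75"
VARS g3 CENTERY="-0.75"
VARS g3 TILEX="0"
VARS g3 TILEY="1"
JOB g4 goatbrot.sub
VARS g4 CENTERX="0.75"
VARS g4 CENTERY="-0.75"
VARS g4 TILEX="1"
VARS g4 TILEY="1"
JOB montage montage.sub
PARENT g1 g2 g3 g4 CHILD montage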
What happens?","title":"Declare your variables"},{"location":"ASP2024/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"ASP2024/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"ASP2024/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"ASP2024/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. 
For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . 
The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"ASP2024/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"ASP2024/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"ASP2024/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. 
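Putting the custom-image steps described above together, a minimal Dockerfile might look like the following sketch. It assumes the OSG Ubuntu Xenial base image is published on Docker Hub as opensciencegrid/osgvo-ubuntu-xenial, and the package line is only a placeholder for whatever software your own job needs:

# Base the image on one of the OSG-provided images (Docker Hub name assumed)
FROM opensciencegrid/osgvo-ubuntu-xenial:latest
# Placeholder: install whatever extra software your job requires
RUN apt-get update && apt-get install -y python3 && apt-get clean
# Required directory so the container can access /cvmfs, as noted above
RUN mkdir -p /cvmfs

Build and push it with the docker build and docker push commands shown above, then register it for CVMFS distribution as described in the section on distributing custom images.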
The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"ASP2024/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"ASP2024/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"ASP2024/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . 
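Looking ahead: once your custom image has been registered and synced into CVMFS (see the distribution section), steering a job into it works exactly like the EL7 example above. A sketch, where namespace/repository_name stands in for your own, hypothetical, Docker Hub identifier:

universe = vanilla
executable = job.sh
Requirements = HAS_SINGULARITY == TRUE
+SingularityImage = "/cvmfs/singularity.opensciencegrid.org/namespace/repository_name:latest"
+SingularityBindCVMFS = True
should_transfer_files = IF_NEEDED
when_to_transfer_output = ON_EXIT
output = out
error = err
log = log
queue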
Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"ASP2024/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"ASP2024/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"ASP2024/ASP2024_Materials/","text":"ASP 2024 Materials We will be using OSG Training Pool for this set of sessions. Please visit https://notebook.ospool.osg-htc.org/hub/login to log in. Wednesday Afternoon - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday Afternoon - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Morning - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Morning - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Thursday Afternoon - Computational Infrastructures - Session 5 ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"ASP 2024 Materials"},{"location":"ASP2024/ASP2024_Materials/#asp-2024-materials","text":"We will be using OSG Training Pool for this set of sessions. 
Please visit https://notebook.ospool.osg-htc.org/hub/login to log in.","title":"ASP 2024 Materials"},{"location":"ASP2024/ASP2024_Materials/#wednesday-afternoon-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday Afternoon - Computational Infrastructures - Session 1"},{"location":"ASP2024/ASP2024_Materials/#wednesday-afternoon-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday Afternoon - Computational Infrastructures - Session 2"},{"location":"ASP2024/ASP2024_Materials/#thursday-morning-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Morning - Computational Infrastructures - Session 3"},{"location":"ASP2024/ASP2024_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"ASP2024/ASP2024_Materials/#thursday-morning-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Morning - Computational Infrastructures - Session 4"},{"location":"ASP2024/ASP2024_Materials/#thursday-afternoon-computational-infrastructures-session-5","text":"ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides","title":"Thursday Afternoon - Computational Infrastructures - Session 5"},{"location":"ASP2024/ASP2024_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"Contact information"},{"location":"ASP2024/ASP2024_Schedule/","text":"ASP 2024 High-Level Curriculum Overview Wednesday PM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday AM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers Thursday PM Session ATLAS Analysis Example Close out and resources for further collaboration Detailed Schedule Wednesday 17-July 2024 Time Description Instructor 14:30 Welcome and the Landscape of Research Computing Jae Yu 15:15 Exercise - UNIX Refresher, Running simple Condor jobs All 16:00 Coffee Break 16:30 Profiling your application and finding a home for your workflow Pat Skubic 17:15 Exercise - Running scripting and R jobs with Condor All Thursday 18-July 2024 Time Description Instructor 09:30 Workflows and distributed environments Horst Severini 10:15 Exercise - DAGMan All 11:00 Coffee Break 11:30 A Brief Introduction to Clouds and Containers Horst Severini 12:15 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 13:00 Lunch 14:30 ATLAS Analysis Example Pat Skubic 14:30 ATLAS Analysis Exercises All 16:00 Coffee Break 16:30 More Exercise Time All 17:30 Closing Thoughts All Materials Materials Page","title":"ASP 2024 
Schedule"},{"location":"ASP2024/ASP2024_Schedule/#asp-2024","text":"","title":"ASP 2024"},{"location":"ASP2024/ASP2024_Schedule/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"ASP2024/ASP2024_Schedule/#wednesday-pm-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday PM Session"},{"location":"ASP2024/ASP2024_Schedule/#thursday-am-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers","title":"Thursday AM Session"},{"location":"ASP2024/ASP2024_Schedule/#thursday-pm-session","text":"ATLAS Analysis Example Close out and resources for further collaboration","title":"Thursday PM Session"},{"location":"ASP2024/ASP2024_Schedule/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2024/ASP2024_Schedule/#wednesday-17-july-2024","text":"Time Description Instructor 14:30 Welcome and the Landscape of Research Computing Jae Yu 15:15 Exercise - UNIX Refresher, Running simple Condor jobs All 16:00 Coffee Break 16:30 Profiling your application and finding a home for your workflow Pat Skubic 17:15 Exercise - Running scripting and R jobs with Condor All","title":"Wednesday 17-July 2024"},{"location":"ASP2024/ASP2024_Schedule/#thursday-18-july-2024","text":"Time Description Instructor 09:30 Workflows and distributed environments Horst Severini 10:15 Exercise - DAGMan All 11:00 Coffee Break 11:30 A Brief Introduction to Clouds and Containers Horst Severini 12:15 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 13:00 Lunch 14:30 ATLAS Analysis Example Pat Skubic 14:30 ATLAS Analysis Exercises All 16:00 Coffee Break 16:30 More Exercise Time All 17:30 Closing Thoughts All","title":"Thursday 18-July 2024"},{"location":"ASP2024/ASP2024_Schedule/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2024/AnalysisExample/","text":"ATLAS Analysis Example Introduction Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker. Prerequisite Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command. Simple Analysis Example Step 1: Create simulated data using the grid Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. 
----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q . Step 2: Analyze Real Data We will not submit grid jobs during this exercise. So we will skip submit script. 
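Before running the analysis macro given below, it can help to peek inside muons.root and confirm the tree and branch names it expects. Assuming the soft link to /opt/data/muons.root that is set up later in this section, a quick interactive look is:

$ root muons.root
POOLCollectionTree->Print();
.q

The Print() call lists every branch in the ntuple, including the NLooseMuon and LooseMuon* branches read by readEvents.C.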
----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new 
TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); 
zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. ----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them. Step 3: Make TSelector Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
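To make the Step 3 edits easier to follow, here is how the added lines fit together once they are in place; only the additions are shown, with comments marking where each one goes in the code that MakeSelector generated. The header file name after #include was lost in the page conversion above and is inferred here as TH1F.h, which is what the histogram type requires.

// Additions to s0.h
#include "TH1F.h"                              // after the existing include statements (inferred header name)
TH1F *e;                                       // inside "class s0 : public TSelector { public :"

// Additions to s0.C
e = new TH1F("e", "e", 1000, -199.0, 1200.0);  // in s0::SlaveBegin()
GetEntry(entry);                               // at the start of s0::Process()
e->Fill(Energy);
TFile f("histograms.root","RECREATE");         // in s0::Terminate()
f.WriteObject(e,"Energy");
f.Close();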
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"ATLAS Analysis Example"},{"location":"ASP2024/AnalysisExample/#atlas-analysis-example","text":"","title":"ATLAS Analysis Example"},{"location":"ASP2024/AnalysisExample/#introduction","text":"Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker.","title":"Introduction"},{"location":"ASP2024/AnalysisExample/#prerequisite","text":"Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command.","title":"Prerequisite"},{"location":"ASP2024/AnalysisExample/#simple-analysis-example","text":"","title":"Simple Analysis Example"},{"location":"ASP2024/AnalysisExample/#step-1-create-simulated-data-using-the-grid","text":"Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. 
This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. ----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . 
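If you prefer typing commands to clicking in the TBrowser, a minimal sketch of the equivalent command for a single file (using the file and tree names from above) is:

// at the ROOT prompt, after opening the file with: root t00.root
t0->Draw("Energy");   // histogram the Energy branch of the t0 tree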
You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q .","title":"Step 1: Create simulated data using the grid"},{"location":"ASP2024/AnalysisExample/#step-2-analyze-real-data","text":"We will not submit grid jobs during this exercise. So we will skip submit script. ----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} 
[GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // 
in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. ----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them.","title":"Step 2: Analyze Real Data"},{"location":"ASP2024/AnalysisExample/#step-3-make-tselector","text":"Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
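Before moving on to the batch scripts, you can sanity-check the new selector interactively over a single file; a minimal sketch, assuming the t00.root file from Step 1 and the s0.C/s0.h files created above:

// at the ROOT prompt, started with: root -b t00.root
t0->Process("s0.C+");   // compile the selector with ACLiC and run it over the t0 tree
// afterwards, histograms.root should contain the Energy histogram written in Terminate()

The run-root-2.C macro below does the same thing for both data files by way of a TChain.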
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"Step 3: Make TSelector"},{"location":"ASP2024/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon 2017"},{"location":"ASP2024/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"ASP2024/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2024/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"ASP2024/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"ASP2024/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"ASP2024/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2024/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. 
Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon School Materials"},{"location":"ASP2024/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"ASP2024/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"ASP2024/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"ASP2024/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"ASP2024/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"ASP2024/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"ASP2024/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"ASP2024/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"ASP2024/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"ASP2024/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2024/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"ASP2024/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"ASP2024/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"ASP2024/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2024/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"ASP2024/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources 
Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"ASP2024/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataKigali2018/","text":"Placeholder.","title":"Index"},{"location":"DataKigali2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"DataKigali2018/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"DataKigali2018/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"DataKigali2018/01-Introduction/#which-condor","text":"We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataKigali2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataKigali2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataKigali2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataKigali2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataKigali2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataKigali2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataKigali2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataKigali2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? 
Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataKigali2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataKigali2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataKigali2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? 
This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataKigali2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataKigali2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataKigali2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. 
In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . 
If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataKigali2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataKigali2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataKigali2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataKigali2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
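For example, here is a sketch that combines condor_q with whoami so you do not have to type your username by hand: $ condor_q $(whoami) -nobatch and, if you prefer a view that refreshes itself every few seconds: $ watch -n 10 condor_q $(whoami) -nobatch (press Ctrl+C to stop watching).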
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataKigali2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataKigali2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataKigali2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataKigali2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataKigali2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataKigali2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataKigali2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataKigali2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataKigali2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataKigali2018/04-TipsandTricks/#tips-for-condor_q","text":"condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. 
How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataKigali2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataKigali2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataKigali2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataKigali2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataKigali2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . 
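If you get stuck, here is a minimal sketch of such a submit file, reusing the project name and requirements from the earlier exercises (adjust the file names to taste): Universe = vanilla Executable = simple.sh Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = submit.log Output = submit.out Error = submit.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue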
You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataKigali2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataKigali2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. 
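(If you would like to double-check that the module loaded, a quick sanity test is $ R --version which should print the version of R provided through OASIS.)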
After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataKigali2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataKigali2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataKigali2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) 
This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataKigali2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataKigali2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataKigali2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. 
It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataKigali2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataKigali2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataKigali2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. 
We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataKigali2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataKigali2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataKigali2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. 
#!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataKigali2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataKigali2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. 
The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataKigali2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataKigali2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataKigali2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. 
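(The tile parameters follow directly from the original invocation: the full picture was centered at 0,0 with a width of 3, so each quadrant is 1.5 wide and its center sits 0.75 away from the origin in x and y, which is where arguments such as -c -0.75,0.75 -w 1.5 come from; likewise, four 500x500 tiles reassemble into the original 1000x1000 image.)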
The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataKigali2018/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataKigali2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
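If you want to confirm this for yourself, the universe is recorded in the job's ClassAd. A minimal check, assuming the DAGMan job landed in cluster 61 as in the output above (JobUniverse 7 is HTCondor's code for the scheduler universe): $ condor_history -long 61.0 | grep JobUniverse JobUniverse = 7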
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataKigali2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataKigali2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataKigali2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataKigali2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . 
job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataKigali2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataKigali2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
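If you would like to look over exactly what DAGMan will be given before anything runs, condor_submit_dag can also write out the DAGMan submit description without submitting it. A small sketch using its -no_submit option (remove the generated goatbrot.dag.* files, or add -f, before you submit for real): $ condor_submit_dag -no_submit goatbrot.dag $ cat goatbrot.dag.condor.sub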
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataKigali2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataKigali2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataKigali2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
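Because the four goatbrot submit files in the next sections differ only in the center coordinates and the tile indices, you do not have to type them all by hand. The loop below is just a sketch (bash), with the executable path and values copied from the submit files that follow:
n=1
for y in 0 1; do
  for x in 0 1; do
    [ $x -eq 0 ] && cx=-0.75 || cx=0.75
    [ $y -eq 0 ] && cy=0.75  || cy=-0.75
    # write goatbrot1.sub ... goatbrot4.sub, one per tile
    cat > goatbrot${n}.sub <<EOF
executable = /stash/user/rquick/public/goatbrot-master/goatbrot
arguments = -i 100000 -c ${cx},${cy} -w 1.5 -s 500,500 -o tile_${y}_${x}.ppm
log = goatbrot.log
output = goatbrot.out.${y}.${x}
error = goatbrot.err.${y}.${x}
requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\")
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
    n=$((n+1))
  done
done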
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataKigali2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataKigali2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataKigali2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataKigali2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataKigali2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"DataKigali2018/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"DataKigali2018/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataKigali2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataKigali2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
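Before editing anything, it can help to confirm which node failed and why from the files already on disk. For example, the grep below simply pulls out the failure message shown earlier, and montage.err holds the failed node's own stderr: $ grep 'failed with status' goatbrot.dag.dagman.out 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. $ cat montage.err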
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataKigali2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataKigali2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataKigali2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataKigali2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataKigali2018/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataKigali2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataKigali2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataKigali2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataKigali2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
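To make the custom-image steps above concrete, here is a minimal Dockerfile sketch. It assumes the provided Ubuntu Xenial base image is published on Docker Hub as opensciencegrid/osgvo-ubuntu-xenial (check the GitHub repository linked above for the authoritative name), and the extra package is purely illustrative.

```
# Dockerfile sketch: start from the (assumed) OSG Ubuntu Xenial base image
FROM opensciencegrid/osgvo-ubuntu-xenial:latest

# Install whatever extra software your job needs (illustrative example only)
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 && \
    rm -rf /var/lib/apt/lists/*

# Required directory so tools and data on /cvmfs can be bind-mounted at run time
RUN mkdir -p /cvmfs
```

Build and push it exactly as shown above (docker build -t namespace/repository_name . followed by docker push namespace/repository_name), then register the image as described in the next section.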
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataKigali2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataKigali2018/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataKigali2018/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. 
The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataKigali2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"DataKigali2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataKigali2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataKigali2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . 
Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataKigali2018/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataKigali2018/Materials/","text":"Data Kigali School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Friday Morning - Computational Infrastructures - Session 5 Lecture 5 - Persistent Identifiers, Digital Objects, an Architecture for a Data Centeric Internet Friday Morning - Computational Infrastructures - Session 6 Getting Involved with CODATA, RDA, and the Foundational Schools of Research Data Science Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Data Kigali 2018 Materials"},{"location":"DataKigali2018/Materials/#data-kigali-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Kigali School Materials"},{"location":"DataKigali2018/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataKigali2018/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataKigali2018/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataKigali2018/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataKigali2018/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataKigali2018/Materials/#friday-morning-computational-infrastructures-session-5","text":"Lecture 5 - Persistent Identifiers, Digital Objects, an Architecture for a Data Centeric Internet","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataKigali2018/Materials/#friday-morning-computational-infrastructures-session-6","text":"Getting Involved with CODATA, RDA, and the Foundational Schools of Research Data Science","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataKigali2018/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"DataKigali2018/School/","text":"Data Kigali 2018 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials Detailed Schedule Thursday 16-August-2018 Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick Friday 17-August-2018 Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob Quick 11:00 Coffee Break 11:30 Becoming Involved and Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Kigali 2018 Schedule"},{"location":"DataKigali2018/School/#data-kigali-2018","text":"","title":"Data Kigali 2018"},{"location":"DataKigali2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"DataKigali2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataKigali2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"DataKigali2018/School/#friday-am-session","text":"A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials","title":"Friday AM Session"},{"location":"DataKigali2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataKigali2018/School/#thursday-16-august-2018","text":"Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick","title":"Thursday 16-August-2018"},{"location":"DataKigali2018/School/#friday-17-august-2018","text":"Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob 
Quick 11:00 Coffee Break 11:30 Becoming Involved and Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 17-August-2018"},{"location":"DataKigali2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataSaoPaulo2018/","text":"Placeholder.","title":"Index"},{"location":"DataSaoPaulo2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"DataSaoPaulo2018/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"DataSaoPaulo2018/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"DataSaoPaulo2018/01-Introduction/#which-condor","text":"We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataSaoPaulo2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataSaoPaulo2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataSaoPaulo2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataSaoPaulo2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataSaoPaulo2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataSaoPaulo2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataSaoPaulo2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataSaoPaulo2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? 
Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataSaoPaulo2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataSaoPaulo2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataSaoPaulo2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? 
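As a hint for the bonus questions, here is a sketch of how -constraint and -format are typically used. YOUR_USER_ID is a placeholder; JobStatus, ClusterId and ProcId are standard job ClassAd attribute names (JobStatus 1 means idle, 2 means running).

```
# Show only your idle jobs
$ condor_q YOUR_USER_ID -constraint 'JobStatus == 1'

# Print one compact line per job: cluster.process and its numeric status
$ condor_q YOUR_USER_ID -format "%d." ClusterId -format "%d " ProcId -format "%d\n" JobStatus
```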
This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataSaoPaulo2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataSaoPaulo2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. 
In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. 
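For example, one possible naming pattern (a sketch of the idea, not the only acceptable answer): Output = simple.$(Cluster).$(Process).out Error = simple.$(Cluster).$(Process).error Log = simple.$(Cluster).$(Process).log With names like these, every submission writes into files tagged with its own cluster number, so earlier results are not overwritten.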
Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
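If you would rather not keep re-running condor_q by hand, the standard watch utility can poll it for you (an aside, not part of the original exercise; press Ctrl-C to stop watching): $ watch -n 10 condor_q YOUR_USER_ID -nobatch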
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataSaoPaulo2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#tips-for-condor_q","text":"condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. 
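For instance, a couple of other attributes worth grepping for (these are standard job ClassAd attribute names): $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestMemory $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep JobStatus JobStatus is numeric: 1 means idle, 2 running, and 5 held.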
How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataSaoPaulo2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataSaoPaulo2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataSaoPaulo2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . 
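If you want to check your answer, a minimal sketch might look like the following (it simply reuses the conventions from the earlier exercises; the file names are only suggestions): Universe = vanilla Executable = simple.sh Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.sh.log Output = simple.sh.out Error = simple.sh.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue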
You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataSaoPaulo2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataSaoPaulo2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. 
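Before writing any R, you can confirm that the module really put R on your PATH (a quick check, not part of the original text): $ which R $ R --version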
After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataSaoPaulo2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataSaoPaulo2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataSaoPaulo2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) 
This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataSaoPaulo2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataSaoPaulo2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataSaoPaulo2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. 
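(One small note: the program as written starts the sequence at fibvals[1] = 1 rather than 0, so the first two values it prints are both 1; the recurrence fibvals[i] = fibvals[i-1] + fibvals[i-2] then produces 2, 3, 5, 8 and so on.)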
It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataSaoPaulo2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataSaoPaulo2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. 
#!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war.
We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. 
#!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataSaoPaulo2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. 
The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. 
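If you are wondering where the -c and -w values for the four tiles come from: the full picture is centred on 0,0 with a width (and height) of 3, so splitting it into a 2 by 2 grid gives tiles of width 3/2 = 1.5 whose centres sit at x = -0.75 or +0.75 and y = +0.75 or -0.75. The short script below is just a sketch that prints the four commands so you can check the arithmetic; the goatbrot path is the one used in these lessons.

#!/bin/sh
# print-tiles.sh -- print the four goatbrot commands for a 2x2 split of
# the width-3 image centred on 0,0 (pipe the output to sh to run them).
GOATBROT=/stash/user/rquick/public/goatbrot-master/goatbrot
TILE_WIDTH=1.5      # full width 3 divided by 2 columns
for row in 0 1; do
  for col in 0 1; do
    # column 0 is the left half  (centre x = -0.75), column 1 the right (+0.75)
    # row    0 is the top half   (centre y = +0.75), row    1 the bottom (-0.75)
    if [ "$col" -eq 0 ]; then cx=-0.75; else cx=0.75; fi
    if [ "$row" -eq 0 ]; then cy=0.75;  else cy=-0.75; fi
    echo "$GOATBROT -i 1000 -o tile_00000${row}_00000${col}.ppm -c $cx,$cy -w $TILE_WIDTH -s 500,500"
  done
done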
The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataSaoPaulo2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. 
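A hint for the second "On your own" question: the expression is easier to read once you keep in mind how condor_dagman exits. The block below is just the on_exit_remove line already shown in simple.dag.condor.sub, restated with the interpretation spelled out in comments.

# Evaluated by the schedd when condor_dagman exits. True means "remove the
# job from the queue"; False means "leave it queued so DAGMan restarts".
#
#   ExitSignal =?= 11            DAGMan died with a segfault; restarting it
#                                would likely just crash again, so remove it.
#   ExitCode between 0 and 2     DAGMan finished on its own (0 = the DAG
#                                succeeded; 1 or 2 = DAGMan gave up
#                                deliberately, e.g. after writing a rescue
#                                DAG), so remove it.
#
# Anything else (for example DAGMan being killed when the submit host
# reboots) leaves the expression False, so the schedd keeps the job queued
# and DAGMan starts up again and recovers the DAG.
on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >= 0 && ExitCode <= 2) )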
Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataSaoPaulo2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
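The four goatbrot submit files above differ only in the -c argument and the tile/output names, so once you understand them you can generate the whole set, and the DAG itself, with a small script. The sketch below is one possible way to do it, not part of the lesson: with n=2 it reproduces the files above (apart from the submit file names), and with a larger n it gives you a head start on the "bigger DAG" challenge at the end of this lesson. It does not regenerate montage.sub or wrapper_montage.sh; for n larger than 2 you would still extend their tile lists and the -tile option by hand. The centre arithmetic needs fractions, so it shells out to awk.

#!/bin/sh
# make_tiles.sh -- generate goatbrot submit files and a DAG for an n x n grid.
n=${1:-2}
GOATBROT=/stash/user/rquick/public/goatbrot-master/goatbrot
REQS='(HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 6") && (OpSys == "LINUX")'

dag=goatbrot.dag
: > "$dag"
nodes=""

row=0
while [ "$row" -lt "$n" ]; do
  col=0
  while [ "$col" -lt "$n" ]; do
    # Tile width is 3/n; tile centres step across the square from
    # (-1.5, 1.5) at the upper left to (1.5, -1.5) at the lower right.
    cx=$(awk -v n="$n" -v c="$col" 'BEGIN { printf "%g", -1.5 + 3/n*(c+0.5) }')
    cy=$(awk -v n="$n" -v r="$row" 'BEGIN { printf "%g",  1.5 - 3/n*(r+0.5) }')
    w=$(awk  -v n="$n" 'BEGIN { printf "%g", 3/n }')
    sub=goatbrot_${row}_${col}.sub
    cat > "$sub" <<EOF
executable = $GOATBROT
arguments  = -i 100000 -c $cx,$cy -w $w -s 500,500 -o tile_${row}_${col}.ppm
log    = goatbrot.log
output = goatbrot.out.$row.$col
error  = goatbrot.err.$row.$col
requirements = $REQS
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
    echo "JOB g${row}_${col} $sub" >> "$dag"
    nodes="$nodes g${row}_${col}"
    col=$((col + 1))
  done
  row=$((row + 1))
done

echo "JOB montage montage.sub" >> "$dag"
echo "PARENT$nodes CHILD montage" >> "$dag"

The last line is also the answer to the question above: every goatbrot node is a PARENT of the montage node and nothing else, so the goatbrot jobs are free to run at the same time, and montage cannot start until all of them have finished.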
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"DataSaoPaulo2018/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataSaoPaulo2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataSaoPaulo2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
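Before editing anything, it can help to confirm the cause from the node's own output. A minimal check, using the file names already set in montage.sub (the exact complaint about the stray -h argument will depend on your montage wrapper):

$ cat montage.err
$ cat montage.out

With the cause confirmed, the fix is straightforward.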
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataSaoPaulo2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
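To see where this is heading, here is a hedged sketch of what the whole DAG can collapse to once every goatbrot node shares this single goatbrot.sub. The montage node and the PARENT/CHILD line are unchanged from the earlier exercise; the CENTERX/CENTERY values for g2-g4 are assumed from the usual four-tile layout, so check them against your original submit files:

JOB g1 goatbrot.sub
VARS g1 CENTERX=\"-0.75\" CENTERY=\"0.75\" TILEX=\"0\" TILEY=\"0\"
JOB g2 goatbrot.sub
VARS g2 CENTERX=\"0.75\" CENTERY=\"0.75\" TILEX=\"1\" TILEY=\"0\"
JOB g3 goatbrot.sub
VARS g3 CENTERX=\"-0.75\" CENTERY=\"-0.75\" TILEX=\"0\" TILEY=\"1\"
JOB g4 goatbrot.sub
VARS g4 CENTERX=\"0.75\" CENTERY=\"-0.75\" TILEX=\"1\" TILEY=\"1\"
JOB montage montage.sub
PARENT g1 g2 g3 g4 CHILD montage

DAGMan also accepts one macro per VARS line, which is the style used for the single-node example that follows.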
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataSaoPaulo2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
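Putting the advice above together, a minimal Dockerfile sketch for a custom image might look like the following. The base image name opensciencegrid/osgvo-ubuntu-xenial is assumed to be the Docker Hub counterpart of the CVMFS Ubuntu Xenial image (confirm it in the GitHub repository linked above), and the package line is only a placeholder for whatever software your jobs need:

# start from an OSG-provided base so the expected tools are present
FROM opensciencegrid/osgvo-ubuntu-xenial:latest
# required directories so /cvmfs can be mounted inside the container
RUN mkdir -p /cvmfs
# install your own software on top of the base (placeholder package name)
RUN apt-get update && apt-get install -y your-package

Build and push it with the docker build and docker push commands shown earlier, then register it as described in the next section.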
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataSaoPaulo2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataSaoPaulo2018/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataSaoPaulo2018/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. 
The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataSaoPaulo2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"DataSaoPaulo2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataSaoPaulo2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataSaoPaulo2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . 
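Looking ahead to how a synced image is eventually used: once it appears under the CVMFS path, a job selects it exactly like the OSG-provided images. A hedged sketch, reusing the namespace/repository_name placeholder from the custom-image section and assuming the :latest tag:

+SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/namespace/repository_name:latest\"
+SingularityBindCVMFS = True

The registration steps that make this path exist are described next.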
Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataSaoPaulo2018/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataSaoPaulo2018/Materials/","text":"Data Sao Paulo School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Friday Morning - Computational Infrastructures - Session 5 Lecture 5 - Persistent Identifiers, Digital Objects, an Architecture for a Data Centeric Internet Friday Morning - Computational Infrastructures - Session 6 Close Out - What to do next? Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended #DataSaoPaulo. If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Data Sao Paulo 2018 Materials"},{"location":"DataSaoPaulo2018/Materials/#data-sao-paulo-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Sao Paulo School Materials"},{"location":"DataSaoPaulo2018/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataSaoPaulo2018/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataSaoPaulo2018/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataSaoPaulo2018/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataSaoPaulo2018/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataSaoPaulo2018/Materials/#friday-morning-computational-infrastructures-session-5","text":"Lecture 5 - Persistent Identifiers, Digital Objects, an Architecture for a Data Centeric Internet","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataSaoPaulo2018/Materials/#friday-morning-computational-infrastructures-session-6","text":"Close Out - What to do next?","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataSaoPaulo2018/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended #DataSaoPaulo. 
If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"DataSaoPaulo2018/School/","text":"Data Sao Paulo 2018 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials Detailed Schedule Thursday 13-December-2018 Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick Friday 14-December-2018 Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob Quick 11:00 Coffee Break 11:30 Becoming Involved and Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Sao Paulo 2018 Schedule"},{"location":"DataSaoPaulo2018/School/#data-sao-paulo-2018","text":"","title":"Data Sao Paulo 2018"},{"location":"DataSaoPaulo2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"DataSaoPaulo2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataSaoPaulo2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"DataSaoPaulo2018/School/#friday-am-session","text":"A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials","title":"Friday AM Session"},{"location":"DataSaoPaulo2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataSaoPaulo2018/School/#thursday-13-december-2018","text":"Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick","title":"Thursday 13-December-2018"},{"location":"DataSaoPaulo2018/School/#friday-14-december-2018","text":"Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob Quick 11:00 Coffee Break 11:30 Becoming 
Involved and Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 14-December-2018"},{"location":"DataSaoPaulo2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataTrieste2018/","text":"Placeholder.","title":"Index"},{"location":"DataTrieste2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"DataTrieste2018/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"DataTrieste2018/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"DataTrieste2018/01-Introduction/#which-condor","text":"We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataTrieste2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataTrieste2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataTrieste2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataTrieste2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataTrieste2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataTrieste2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataTrieste2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataTrieste2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? 
Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataTrieste2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataTrieste2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataTrieste2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? 
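As a starting point for these bonus questions, here is a hedged sketch of the kinds of invocations to try; ClusterId, ProcId, Owner, and JobStatus are standard job ClassAd attributes (JobStatus 2 means running), and 84.0 is just an example job id to replace with one of your own:

$ condor_q -constraint 'JobStatus == 2'
$ condor_q -format '%d.' ClusterId -format '%d ' ProcId -format '%s\n' Owner
$ condor_q -l 84.0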
This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataTrieste2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataTrieste2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataTrieste2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. 
In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include <stdio.h> #include <stdlib.h> #include <unistd.h> int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program. Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username, use the whoami command.
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . 
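For example, one possible sketch of just the three affected lines (any naming scheme that includes both macros will do): Log = simple.$(Cluster).$(Process).log Output = simple.$(Cluster).$(Process).out Error = simple.$(Cluster).$(Process).error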
If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataTrieste2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataTrieste2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataTrieste2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include <stdio.h> #include <stdlib.h> #include <unistd.h> int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataTrieste2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program. Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username, use the whoami command.
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataTrieste2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataTrieste2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataTrieste2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataTrieste2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataTrieste2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataTrieste2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataTrieste2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. 
(This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataTrieste2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataTrieste2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataTrieste2018/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataTrieste2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataTrieste2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataTrieste2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? 
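If you get stuck, here is one possible sketch that reuses the pattern of the earlier submit file (the file names here are just suggestions): Universe = vanilla Executable = simple.sh Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.sh.log Output = simple.sh.out Error = simple.sh.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue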
This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataTrieste2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataTrieste2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataTrieste2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataTrieste2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. 
CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. 
$ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataTrieste2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataTrieste2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataTrieste2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataTrieste2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataTrieste2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. 
It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataTrieste2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataTrieste2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataTrieste2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. 
universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataTrieste2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. 
Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataTrieste2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataTrieste2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataTrieste2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. 
It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataTrieste2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataTrieste2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. 
-o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Move file to local machine for viewing cp mandle.gif ~/public 1. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. 
Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataTrieste2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataTrieste2018/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Move file to local machine for viewing cp mandle.gif ~/public 1. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataTrieste2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. 
Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? 
Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataTrieste2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataTrieste2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
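(An aside you can check for yourself while a DAG is running: scheduler universe jobs carry JobUniverse == 7 in their job ClassAd, so a constraint such as
$ condor_q -constraint 'JobUniverse == 7'
should list only the condor_dagman job. This is just a sketch; if in doubt, condor_q -long on the DAGMan job id will show the JobUniverse attribute directly.)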
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataTrieste2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataTrieste2018/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"DataTrieste2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
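(If you would rather not type four nearly identical submit files by hand, the sketch below generates them with a small shell loop. It is only a convenience and assumes exactly the paths, centre points and requirements used in the hand-written versions that follow; writing the four files manually, as shown next, works just as well.)
#!/bin/bash
# Sketch: write goatbrot1.sub ... goatbrot4.sub, varying only the centre point and tile indices.
i=1
while read cx cy row col; do
  cat > goatbrot${i}.sub <<EOF
executable = /stash/user/rquick/public/goatbrot-master/goatbrot
arguments = -i 100000 -c ${cx},${cy} -w 1.5 -s 500,500 -o tile_${row}_${col}.ppm
log = goatbrot.log
output = goatbrot.out.${row}.${col}
error = goatbrot.err.${row}.${col}
requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\")
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
  i=$((i+1))
done <<'DATA'
-0.75 0.75 0 0
0.75 0.75 0 1
-0.75 -0.75 1 0
0.75 -0.75 1 1
DATA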
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
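(A hint in case you want to check your reasoning: none of the goatbrot nodes name each other as PARENT or CHILD, so DAGMan is free to have all four queued at once, while the single PARENT g1 g2 g3 g4 CHILD montage line forces montage to wait until every one of them has succeeded. If you would like to confirm that the DAG file at least parses before running it, condor_submit_dag has a -no_submit option that writes goatbrot.dag.condor.sub without queueing anything:
$ condor_submit_dag -no_submit goatbrot.dag
This step is optional; the normal submission follows below.)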
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataTrieste2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataTrieste2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful with the wildcard: you want to delete the goatbrot.dag.* files, not the goatbrot.dag file itself. $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*\",\"title\":\"Watch your DAG\"},{\"location\":\"DataTrieste2018/10-ComplexDAG/#on-your-own\",\"text\":\"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.\",\"title\":\"On your own.\"},{\"location\":\"DataTrieste2018/11-HandlingFailure/\",\"text\":\"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fail. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a job fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments.
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataTrieste2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataTrieste2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
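Before editing anything, it can be worth confirming why the node failed by looking at the files named in montage.sub (a quick check; the exact message depends on how montage reacts to the bogus flag):

    $ cat montage.out montage.err
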
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataTrieste2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataTrieste2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
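Under the hood this is ordinary submit-file macro expansion: judging from the dagman.out excerpts earlier, DAGMan hands each node's values to condor_submit as extra submit-description lines, roughly equivalent to something like the following (illustrative values, not a command you need to run yourself):

    $ condor_submit -a "CENTERX = -0.75" -a "CENTERY = 0.75" -a "TILEX = 0" -a "TILEY = 0" goatbrot.sub

so $(CENTERX) and the other macros take a different value for every node.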
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataTrieste2018/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataTrieste2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataTrieste2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
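Putting those steps together, a minimal Dockerfile might look like the sketch below. The base image name and the extra package are illustrative assumptions only; for real work, start from the Dockerfile in the GitHub repository mentioned above and adapt it:

    # sketch of a custom image definition (example package; adjust to your needs)
    FROM opensciencegrid/osgvo-ubuntu-xenial:latest
    # software your job needs
    RUN apt-get update && apt-get install -y imagemagick
    # required directories so /cvmfs can be mounted inside the container
    RUN mkdir -p /cvmfs

Build it with docker build -t namespace/repository_name . , push it with docker push namespace/repository_name , and then register it for CVMFS distribution as described in the next section.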
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataTrieste2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataTrieste2018/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataTrieste2018/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. 
The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataTrieste2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"DataTrieste2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataTrieste2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataTrieste2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . 
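On any machine with CVMFS mounted you can browse what has already been synced; for example (path taken from the image locations listed above, and the listing will vary):

    $ ls /cvmfs/singularity.opensciencegrid.org/opensciencegrid/
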
Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataTrieste2018/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataTrieste2018/Materials/","text":"Data Trieste School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Friday Morning - Computational Infrastructures - Session 5 Introduction and cloud computing (presentation) (30\u2019) Introduction to EGI and the EGI cloud infrastructure (30\u2019) Demo & exercise: Explore EGI services, Explore AppDB (30\u2019) The cloud-based EGI Notebooks service (presentation) (30\u2019) Training materials are available at: https://documents.egi.eu/document/3349 Friday Morning - Computational Infrastructures - Session 6 Intro to hands-on exercise 1 (10\u2019) Hands-on exercise 1 \u2013 Download and plot temperature data Intro to hands-on exercise 2 (10\u2019) Hands-on exercise 2 \u2013 Add rainfall data The future of compute infrastructures in Europe: EOSC (30\u2019) Next steps to become a user (15\u2019) Training materials are available at: https://documents.egi.eu/document/3349 Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Data Trieste 2018 Materials"},{"location":"DataTrieste2018/Materials/#data-trieste-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Trieste School Materials"},{"location":"DataTrieste2018/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataTrieste2018/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataTrieste2018/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataTrieste2018/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataTrieste2018/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataTrieste2018/Materials/#friday-morning-computational-infrastructures-session-5","text":"Introduction and cloud computing (presentation) (30\u2019) Introduction to EGI and the EGI cloud infrastructure (30\u2019) Demo & exercise: Explore EGI services, Explore AppDB (30\u2019) The cloud-based EGI Notebooks service (presentation) (30\u2019) Training materials are available at: https://documents.egi.eu/document/3349","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataTrieste2018/Materials/#friday-morning-computational-infrastructures-session-6","text":"Intro to hands-on exercise 1 (10\u2019) Hands-on exercise 1 \u2013 Download and plot temperature data Intro to hands-on exercise 2 (10\u2019) Hands-on exercise 2 \u2013 Add rainfall data The future of compute infrastructures in Europe: EOSC (30\u2019) Next steps to become a user (15\u2019) Training materials are available at: https://documents.egi.eu/document/3349","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataTrieste2018/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"DataTrieste2018/School/","text":"Data Trieste 2018 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Disclipline spefic tutorials Detailed Schedule Thursday 16-August-2018 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 17-August-2018 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos, Guiseppe La Rocca 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos, Guiseppe La Rocca 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2018 Schedule"},{"location":"DataTrieste2018/School/#data-trieste-2018","text":"","title":"Data Trieste 2018"},{"location":"DataTrieste2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"DataTrieste2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataTrieste2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"DataTrieste2018/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration Disclipline spefic tutorials","title":"Friday AM Session"},{"location":"DataTrieste2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataTrieste2018/School/#thursday-16-august-2018","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 
16-August-2018"},{"location":"DataTrieste2018/School/#friday-17-august-2018","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos, Guiseppe La Rocca 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos, Guiseppe La Rocca 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 17-August-2018"},{"location":"DataTrieste2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataTrieste2019/","text":"Placeholder","title":"Index"},{"location":"DataTrieste2019/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.13 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.13, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"DataTrieste2019/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"DataTrieste2019/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.13 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"DataTrieste2019/01-Introduction/#which-condor","text":"We will be using Condor 8.6.13, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. 
Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataTrieste2019/01-Introduction/#where-you-will-work","text":"Today you will log into training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataTrieste2019/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataTrieste2019/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.13 Jan 16 2019 $ $CondorPlatform: X86_64-CentOS_7.6 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/os-release to find out: $ cat /etc/os-release Or you can run: $ hostnamectl Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataTrieste2019/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataTrieste2019/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataTrieste2019/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataTrieste2019/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.13 Jan 16 2019 $ $CondorPlatform: X86_64-CentOS_7.6 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataTrieste2019/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/os-release to find out: $ cat /etc/os-release Or you can run: $ hostnamectl Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. 
Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataTrieste2019/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataTrieste2019/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataTrieste2019/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataTrieste2019/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. 
(A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataTrieste2019/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataTrieste2019/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include <stdio.h> #include <stdlib.h> #include <unistd.h> int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program. Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the submit machine) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username, use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. 
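One minimal way to combine the two macros (just a sketch--adapt the file names to whatever your own submit file already uses) is: Output = simple.$(Cluster).$(Process).out Error = simple.$(Cluster).$(Process).error Log = simple.$(Cluster).$(Process).log 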
Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataTrieste2019/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataTrieste2019/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataTrieste2019/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! 
You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataTrieste2019/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataTrieste2019/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataTrieste2019/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataTrieste2019/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataTrieste2019/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataTrieste2019/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataTrieste2019/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataTrieste2019/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataTrieste2019/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataTrieste2019/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataTrieste2019/04-TipsandTricks/#tips-for-condor_q","text":"condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. 
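If you only care about a single attribute, recent versions of condor_q can also print just its value with the -af (autoformat) option, for example: $ condor_q YOUR_JOB_CLUSTER_NUMBER -af RequestCpus The grep approach used in the examples that follow works everywhere, though. 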
How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataTrieste2019/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataTrieste2019/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
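On a busy submit machine the history can be very long, so it is often useful to cap how many records come back; recent versions of condor_history accept a -limit option, for example: $ condor_history -limit 5 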
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataTrieste2019/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 This script will not be executable without changing the permissions. $ chmod 755 simple.sh Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataTrieste2019/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataTrieste2019/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 This script will not be executable without changing the permissions. $ chmod 755 simple.sh Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . 
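If you want to check yourself, a minimal submit file might look something like this (only a sketch--the requirements and +ProjectName lines are simply copied from the earlier examples): Universe = vanilla Executable = simple.sh Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = submit.log Output = submit.out Error = submit.err requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue 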
You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataTrieste2019/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataTrieste2019/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. 
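(If you want to confirm that the module loaded correctly, a quick check is to ask R for its version: $ R --version ) 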
After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataTrieste2019/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataTrieste2019/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataTrieste2019/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) 
This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataTrieste2019/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataTrieste2019/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataTrieste2019/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. 
It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataTrieste2019/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataTrieste2019/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataTrieste2019/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. 
We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataTrieste2019/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataTrieste2019/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataTrieste2019/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. 
#!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataTrieste2019/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataTrieste2019/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. 
cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2019/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2019/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataTrieste2019/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataTrieste2019/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. 
cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataTrieste2019/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
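If you want extra evidence of that, you can ask the job history which universe each job ran in. This is a sketch and assumes your pool keeps history on the submit host and that your condor_history supports the -limit and -af (autoformat) options; substitute your username for USER:

$ condor_history USER -limit 5 -af ClusterId JobUniverse Cmd

In the JobUniverse column, 7 means the scheduler universe and 5 means the vanilla universe, so the condor_dagman entry should show a 7 while your simple job shows a 5.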
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2019/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2019/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataTrieste2019/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataTrieste2019/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. 
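Before you do, it helps to know the general shape of a DAG file: each node is declared with a line of the form JOB <node-name> <submit-file>, and dependencies between nodes are added with PARENT ... CHILD ... lines (you will use those in the next exercise). As a syntax illustration only, with made-up file names, a two-node DAG could be created like this:

$ cat > sketch.dag <<'EOF'
# hypothetical two-node DAG: run the "prepare" node first, then "sweep"
JOB prepare prepare.sub
JOB sweep   sweep.sub
PARENT prepare CHILD sweep
EOF

The DAG you are about to write is the degenerate case of this: a single JOB line and no PARENT/CHILD lines at all.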
Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataTrieste2019/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataTrieste2019/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
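Because the four goatbrot submit files differ only in which tile they compute (the center point and the tile/output/error file names), you can either type them in by hand, as listed below, or generate them with a small shell loop. A sketch of the generator approach; the script name make-subs.sh is just illustrative, and the files it writes are identical to the ones shown below:

#!/bin/sh
# make-subs.sh - write goatbrot1.sub ... goatbrot4.sub from one template
n=1
for spec in '0 0 -0.75,0.75' '0 1 0.75,0.75' '1 0 -0.75,-0.75' '1 1 0.75,-0.75'; do
    set -- $spec      # $1 = tile row, $2 = tile column, $3 = center point
    cat > goatbrot${n}.sub <<EOF
executable = /stash/user/rquick/public/goatbrot-master/goatbrot
arguments = -i 100000 -c $3 -w 1.5 -s 500,500 -o tile_${1}_${2}.ppm
log = goatbrot.log
output = goatbrot.out.${1}.${2}
error = goatbrot.err.${1}.${2}
requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == "RHEL 6") && (OpSys == "LINUX")
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
    n=$((n+1))
done

Either way you end up with four submit files that differ only in their arguments and output names, which is exactly what the DAG below expects.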
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
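One optional sanity check before submitting: condor_submit_dag can be asked to generate DAGMan's own submit file without submitting anything, which lets you look over what is about to run. This is a sketch that assumes your version of condor_submit_dag supports the -no_submit option (check condor_submit_dag -help if you are unsure):

$ condor_submit_dag -no_submit goatbrot.dag
$ cat goatbrot.dag.condor.sub

If condor_submit_dag later complains that goatbrot.dag.condor.sub and friends already exist, delete them or re-submit with the -f option.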
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataTrieste2019/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataTrieste2019/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"DataTrieste2019/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because its exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For simplicity, DAGMan now lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG. If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataTrieste2019/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataTrieste2019/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
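(A brief aside, not required for this exercise: DAGMan can also retry a failed node automatically before giving up. Adding a line such as RETRY montage 2 to goatbrot.dag would tell DAGMan to re-run the montage node up to two more times before marking it as failed. That only helps with transient failures; here the failure comes from a bad argument, so a retry would fail in exactly the same way.)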
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataTrieste2019/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataTrieste2019/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2019/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2019/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataTrieste2019/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataTrieste2019/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2019/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2019/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataTrieste2019/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
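To make the "Creating a Custom Image" steps more concrete, here is a minimal sketch of a Dockerfile. The base image tag and the extra package are illustrative assumptions only; replace them with whatever your job actually needs:
FROM opensciencegrid/osgvo-ubuntu-xenial
# install whatever extra software your job needs (example package only)
RUN apt-get update && apt-get install -y python3 && rm -rf /var/lib/apt/lists/*
# required directory so tools and data on /cvmfs can be mounted in the container
RUN mkdir -p /cvmfs
You would then build and push it exactly as above, with docker build -t namespace/repository_name . followed by docker push namespace/repository_name , and register it as described in the next section.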
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large number of distributed compute hosts, OSG has chosen to host the images under CVMFS . Any image publicly available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ . To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Page sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataTrieste2019/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataTrieste2019/14-Containers/#objective","text":"Singularity is a container system that allows users full control over their environment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataTrieste2019/14-Containers/#default-image","text":"The default setup is to automatically load an image on sites which support Singularity. Every job which lands on such a site will have a container started just for that job, and will then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataTrieste2019/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"DataTrieste2019/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataTrieste2019/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataTrieste2019/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . 
Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataTrieste2019/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataTrieste2019/Materials/","text":"Data Trieste School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Bonus Lecture - Digital Object Architectures DOA and RPID Friday Morning - Computational Infrastructures - Session 5 Introduction to Cloud Computing Friday Morning - Computational Infrastructures - Session 6 Close Out - What to do next? Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research. If you want long\u2010term OSG access, you can go to http://www.osgconnect.net and sign up. Mention you attended Data Trieste 2019 and want to be added to the DOSAR Project.","title":"Data Trieste 2019 Materials"},{"location":"DataTrieste2019/Materials/#data-trieste-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Trieste School Materials"},{"location":"DataTrieste2019/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataTrieste2019/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataTrieste2019/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataTrieste2019/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataTrieste2019/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataTrieste2019/Materials/#bonus-lecture-digital-object-architectures","text":"DOA and RPID","title":"Bonus Lecture - Digital Object Architectures"},{"location":"DataTrieste2019/Materials/#friday-morning-computational-infrastructures-session-5","text":"Introduction to Cloud Computing","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataTrieste2019/Materials/#friday-morning-computational-infrastructures-session-6","text":"Close Out - What to do next?","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataTrieste2019/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research. If you want long\u2010term OSG access, you can go to http://www.osgconnect.net and sign up. 
Mention you attended Data Trieste 2019 and want to be added to the DOSAR Project.","title":"Contact information"},{"location":"DataTrieste2019/School/","text":"Data Trieste 2019 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Disclipline spefic tutorials Close out and resources for further collaboration Friday AM Session Introduction to Cloud Computing Detailed Schedule Thursday 15-December-2019 Time Discription Instructor 08:30 Welcome and the Landscape of Research Computing Rob Quick 09:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 09:30 Profiling your application and finding a home for your workflow Rob Quick 10:00 Exercise - Single and batch submittion with HTCondor Rob Quick 10:30 Coffee Break 11:00 Worflows and distributed environments Rob Quick 11:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 13:00 Lunch 14:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 15:00 Exercise - DAGMAN Rob Quick 16:00 Coffee Break 16:30 Containers and HTC Wrap-up Rob Quick 17:00 Exercise - Complete Earlier Lessons Rob Quick Friday 14-December-2018 Time Discription Instructor 08:30 Intorduction to Cloud Computing Alessandro Costantini 10:00 Coffee Break 10:30 Introduction to Cloud Computing Alessandro Costantini 13:30 Lunch 14:00 CODATA Simon Hodson 14:30 Close Out Rob Quick Materials Materials Page","title":"Data Trieste 2019 Schedule"},{"location":"DataTrieste2019/School/#data-trieste-2019","text":"","title":"Data Trieste 2019"},{"location":"DataTrieste2019/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"DataTrieste2019/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataTrieste2019/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Disclipline spefic tutorials Close out and resources for further collaboration","title":"Thursday PM Session"},{"location":"DataTrieste2019/School/#friday-am-session","text":"Introduction to Cloud Computing","title":"Friday AM Session"},{"location":"DataTrieste2019/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataTrieste2019/School/#thursday-15-december-2019","text":"Time Discription Instructor 08:30 Welcome and the Landscape of Research Computing Rob Quick 09:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 09:30 Profiling your application and finding a home for your workflow Rob Quick 10:00 Exercise - Single and batch submittion with HTCondor Rob Quick 10:30 Coffee Break 11:00 Worflows and distributed environments Rob Quick 11:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 13:00 Lunch 14:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 15:00 Exercise - DAGMAN Rob Quick 16:00 Coffee Break 16:30 Containers and HTC Wrap-up Rob Quick 17:00 Exercise - Complete Earlier Lessons Rob Quick","title":"Thursday 
15-December-2019"},{"location":"DataTrieste2019/School/#friday-14-december-2018","text":"Time Discription Instructor 08:30 Intorduction to Cloud Computing Alessandro Costantini 10:00 Coffee Break 10:30 Introduction to Cloud Computing Alessandro Costantini 13:30 Lunch 14:00 CODATA Simon Hodson 14:30 Close Out Rob Quick","title":"Friday 14-December-2018"},{"location":"DataTrieste2019/School/#materials","text":"Materials Page","title":"Materials"},{"location":"Materials/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.8 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.2.10, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"Materials/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"Materials/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.8 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"Materials/01-Introduction/#which-condor","text":"We will be using Condor 8.2.10, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"Materials/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"Materials/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"Materials/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.4 Jun 22 2017 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.4-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. 
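(An aside on the collector mentioned a little earlier: if you want to figure out which machine it runs on, one option -- a sketch that relies on the standard HTCondor configuration variable COLLECTOR_HOST, so your pool could be set up differently -- is to ask the local configuration: $ condor_config_val COLLECTOR_HOST ) Back to the condor_q output: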
Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"Materials/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"Materials/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. 
daemons) are running, and what they do.","title":"Objective"},{"location":"Materials/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"Materials/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.4 Jun 22 2017 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"Materials/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.4-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. 
condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"Materials/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"Materials/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"Materials/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"Materials/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) 
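(A hint for the double bonus points above, offered as a sketch rather than the only answer: -constraint takes a ClassAd expression and filters the queue, -format prints just the attributes you ask for, and -l dumps the whole job ClassAd, which you will see again in a later exercise. For example, $ condor_q -constraint 'JobStatus == 2' lists only running jobs, and $ condor_q -format '%d.' ClusterId -format '%d\n' ProcId prints bare job IDs.)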
To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"Materials/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"Materials/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
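(One note before you paste: the #include lines in the listing below appear to have lost their header names, most likely when angle brackets were stripped during conversion. For the program to compile and run as described it presumably needs the usual standard headers -- #include <stdio.h> for printf, #include <stdlib.h> for atoi, and #include <unistd.h> for sleep -- so add those if your compiler complains.)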
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"DataSaoPaulo\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"DataSaoPaulo\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. 
First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. 
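(For the new-outputs-for-each-run item above, a minimal sketch of the lines you would change, assuming the rest of the submit file stays the same: Log = simple.$(Cluster).$(Process).log Output = simple.$(Cluster).$(Process).out Error = simple.$(Cluster).$(Process).error Because every submission gets a fresh cluster number, two submissions then write to different files.) As for the challenges themselves: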
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"Materials/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"Materials/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"Materials/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. 
Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"Materials/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"DataSaoPaulo\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. 
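# (Aside, not part of the original transcript: -nobatch just tells condor_q to print one
# line per job instead of the newer batched summary, and if you would rather not re-run
# the command by hand, most Linux systems let you poll it with something like
#   watch -n 5 condor_q YOUR_USER_ID -nobatch
# Press Ctrl-C to stop watching.)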
$ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"Materials/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"DataSaoPaulo\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. 
$ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"Materials/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"Materials/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"Materials/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"Materials/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"Materials/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"Materials/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 24.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 24.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"Materials/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"Materials/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"Materials/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 24.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 24.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"Materials/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"Materials/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"Materials/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. 
Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"Materials/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"Materials/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"Materials/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"Materials/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. 
Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. 
#!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"Running a job with R"},{"location":"Materials/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"Materials/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"Materials/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"Materials/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"Materials/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) 
PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"Materials/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"Materials/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. 
You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"Materials/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"The answer"},{"location":"Materials/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . 
Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"DataSaoPaulo\" ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"Materials/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"Materials/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"Materials/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. 
It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"DataSaoPaulo\" ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"Materials/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"Materials/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot . A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to your local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point your browser at the file URL: firefox ./mandle.gif The goatbrot program takes several parameters. Let's break them down: -i 1000 The number of iterations.
Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. Run goatbrot 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Move the file for viewing: cp mandle.gif ~/public 2. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"Materials/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"Materials/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to your local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point your browser at the file URL: firefox ./mandle.gif The goatbrot program takes several parameters. Let's break them down: -i 1000 The number of iterations.
-c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"Materials/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. Run goatbrot 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"Materials/08-Mandlebrot/#try-it","text":"Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Move the file for viewing: cp mandle.gif ~/public 2. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"Materials/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep, but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open.
In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? 
Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"Materials/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"Materials/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"Materials/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"Materials/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
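As an aside, you can check this for yourself while a DAG is still in the queue by dumping the DAGMan job's ClassAd; the cluster ID 61.0 below is just the one from the example output above, so substitute whatever ID your own condor_submit_dag reported:
$ condor_q -l 61.0 | grep JobUniverse
JobUniverse = 7
A value of 7 is the scheduler universe; an ordinary vanilla job such as simple would show 5.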
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"Materials/09-SimpleDAG/#on-your-own","text":"Why does DAGMan run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"Materials/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"Materials/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs are very similar to each other, but they have slightly different parameters (arguments) and output files.
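Typing four nearly identical submit files by hand gets tedious, so once you have looked at the four goatbrot submit files listed below, you may prefer to regenerate them with a short shell loop. This is only an optional convenience sketch; it reproduces the same files, using the same executable path and parameters as the exercise:
# Optional: regenerate goatbrot1.sub through goatbrot4.sub
# (same content as the files listed below).
n=0
for spec in "0 0 -0.75,0.75" "0 1 0.75,0.75" "1 0 -0.75,-0.75" "1 1 0.75,-0.75"; do
n=$((n+1))
# $1 $2 $3 come from 'set -- $spec': tile row, tile column, center point
set -- $spec
cat > goatbrot$n.sub <<EOF
executable = /stash/user/rquick/public/goatbrot-master/goatbrot
arguments = -i 100000 -c $3 -w 1.5 -s 500,500 -o tile_$1_$2.ppm
log = goatbrot.log
output = goatbrot.out.$1.$2
error = goatbrot.err.$1.$2
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
done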
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. 
----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 10 condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. 
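Because of that, the goatbrot.out.* files should come back empty; if you want a quick sanity check before reading the error files (a sketch, assuming all four output files were transferred back to your submit directory), try:

$ wc -c goatbrot.out.*

Each file should report a size of 0 bytes. The real report is in the error files, as shown next.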
$ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"Materials/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"Materials/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"Materials/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"Materials/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"Materials/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"Materials/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"Materials/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"Materials/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"Materials/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
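If you want to see what the module system provides before baking it into the wrapper, you can try the same steps interactively on the submit host first (a sketch; module names and versions can differ between OASIS software stacks):

$ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash
$ module avail imagemagick
$ module load imagemagick
$ which montage

Once that works, the wrapper script is simply the same load steps followed by the montage command.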
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"Materials/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"Materials/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"Materials/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 10 condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"Materials/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"Materials/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... 
Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. 
Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"Materials/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"Materials/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 
06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"Materials/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"Materials/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"Materials/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"Materials/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"Materials/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. 
Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"Materials/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"Materials/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
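A brief aside to close out the variable-substitution exercise above: once every goatbrot node uses VARS, the whole goatbrot.dag can be written as the sketch below (DAGMan accepts several macros on one VARS line; the values mirror the four hand-written submit files from the earlier DAG exercise):

JOB g1 goatbrot.sub
JOB g2 goatbrot.sub
JOB g3 goatbrot.sub
JOB g4 goatbrot.sub
JOB montage montage.sub
VARS g1 CENTERX=\"-0.75\" CENTERY=\"0.75\" TILEX=\"0\" TILEY=\"0\"
VARS g2 CENTERX=\"0.75\" CENTERY=\"0.75\" TILEX=\"1\" TILEY=\"0\"
VARS g3 CENTERX=\"-0.75\" CENTERY=\"-0.75\" TILEX=\"0\" TILEY=\"1\"
VARS g4 CENTERX=\"0.75\" CENTERY=\"-0.75\" TILEX=\"1\" TILEY=\"1\"
PARENT g1 g2 g3 g4 CHILD montage

Now, back to picking a tutorial.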
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"Materials/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"Materials/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"Materials/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"Materials/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"Materials/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"Materials/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"Materials/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
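To make the whole recipe concrete, here is a minimal sketch of such a custom image (the base image name opensciencegrid/osgvo-ubuntu-xenial and the imagemagick package are illustrative assumptions; use your own Docker Hub namespace and whatever software your jobs actually need):

# Dockerfile
FROM opensciencegrid/osgvo-ubuntu-xenial:latest
# software your jobs need
RUN apt-get update && apt-get install -y imagemagick && rm -rf /var/lib/apt/lists/*
# /cvmfs mount point (already present in the OSG base images, harmless to repeat)
RUN mkdir -p /cvmfs

$ docker build -t namespace/repository_name .
$ docker push namespace/repository_name

After the push, register the image for CVMFS distribution as described in the next section.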
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"Materials/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"Materials/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"Materials/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. 
This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"Materials/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"Materials/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"Materials/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"Materials/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. 
The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"Materials/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"Materials/DSP_DT2017/","text":"Data Sao Paulo 2017 High-Level Curriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers Friday AM Session UNESP Presentation and Tour Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:00 Exercise - Accessing the Open Science Grid and DAGMan Rob Quick 15:00 Coffee Break 15:30 A Brief Introduction to Clouds and Containers Rob Quick 16:00 Exercise - Using Containers on OSG and Discpline Specfic Tutorials Rob Quick Friday 21-July 2017 Time Discription Instructor 09:30 Introduction and Tour of Grid UNESP Facilities Raphael C\u00f3be 11:00 Coffee Break 11:30 Wrap of Computational Infrastructures Rob Quick 12:00 Closing Thoughts Rob Quick 12:30 Adjourn Materials Materials Page","title":"Data Sao Paulo Schedule"},{"location":"Materials/DSP_DT2017/#data-sao-paulo-2017","text":"","title":"Data Sao Paulo 2017"},{"location":"Materials/DSP_DT2017/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"Materials/DSP_DT2017/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"Materials/DSP_DT2017/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers","title":"Thursday PM Session"},{"location":"Materials/DSP_DT2017/#friday-am-session","text":"UNESP Presentation and Tour Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"Materials/DSP_DT2017/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"Materials/DSP_DT2017/#thursday-20-july-2017","text":"Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob 
Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:00 Exercise - Accessing the Open Science Grid and DAGMan Rob Quick 15:00 Coffee Break 15:30 A Brief Introduction to Clouds and Containers Rob Quick 16:00 Exercise - Using Containers on OSG and Discpline Specfic Tutorials Rob Quick","title":"Thursday 20-July 2017"},{"location":"Materials/DSP_DT2017/#friday-21-july-2017","text":"Time Discription Instructor 09:30 Introduction and Tour of Grid UNESP Facilities Raphael C\u00f3be 11:00 Coffee Break 11:30 Wrap of Computational Infrastructures Rob Quick 12:00 Closing Thoughts Rob Quick 12:30 Adjourn","title":"Friday 21-July 2017"},{"location":"Materials/DSP_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"Materials/DSP_Materials/","text":"Data Sao Paulo School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Friday Morning - Computational Infrastructures - Session 5 Grid UNESP Presentation Friday Morning - Computational Infrastructures - Session 6 Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Data Sao Paulo Materials"},{"location":"Materials/DSP_Materials/#data-sao-paulo-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Sao Paulo School Materials"},{"location":"Materials/DSP_Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"Materials/DSP_Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"Materials/DSP_Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"Materials/DSP_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"Materials/DSP_Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"Materials/DSP_Materials/#friday-morning-computational-infrastructures-session-5","text":"Grid UNESP Presentation","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"Materials/DSP_Materials/#friday-morning-computational-infrastructures-session-6","text":"Computational Infrastructures Wrap Up - Slides","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"Materials/DSP_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"Materials/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon Schedule"},{"location":"Materials/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"Materials/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"Materials/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"Materials/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"Materials/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"Materials/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"Materials/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon Materials"},{"location":"Materials/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"Materials/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"Materials/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"Materials/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"Materials/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"Materials/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"Materials/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"Materials/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"Materials/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"Materials/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"Materials/School/#thursday-am-session","text":"Welcome and Background The Landscape of 
Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"Materials/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"Materials/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"Materials/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"Materials/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"Materials/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"Materials/School/#materials","text":"Materials Page","title":"Materials"},{"location":"Meetings/12March2019/","text":"DOSAR Group Meeting Tuesday, March 12 2019 10:30 ET, 9:30 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies: Upcoming Events ASP2020 Site Planning Visit - Marrakech/Rabat, Morocco - April 1 to 5 - 2019 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019 New Initiatives","title":"March 12, 2019"},{"location":"Meetings/12March2019/#dosar-group-meeting","text":"Tuesday, March 12 2019 10:30 ET, 9:30 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies:","title":"DOSAR Group Meeting"},{"location":"Meetings/12March2019/#upcoming-events","text":"ASP2020 Site Planning Visit - Marrakech/Rabat, Morocco - April 1 to 5 - 2019 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019","title":"Upcoming Events"},{"location":"Meetings/12March2019/#new-initiatives","text":"","title":"New Initiatives"},{"location":"Meetings/24October2018/","text":"DOSAR Group Meeting Wednesday, October 24 12:00 ET, 11:00 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies: Upcoming Events CODATA/RDA School of Research Data Science - Kigali, Rwanda - October 22 to November 2 -2018 CODATA/RDA School of Research Data Science - Sao Paulo, Brazil - December 3 to December 14 - 2018 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019 New Initiatives","title":"October 24, 2018"},{"location":"Meetings/24October2018/#dosar-group-meeting","text":"Wednesday, October 24 12:00 ET, 11:00 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies:","title":"DOSAR Group Meeting"},{"location":"Meetings/24October2018/#upcoming-events","text":"CODATA/RDA 
School of Research Data Science - Kigali, Rwanda - October 22 to November 2 -2018 CODATA/RDA School of Research Data Science - Sao Paulo, Brazil - December 3 to December 14 - 2018 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019","title":"Upcoming Events"},{"location":"Meetings/24October2018/#new-initiatives","text":"","title":"New Initiatives"}]} \ No newline at end of file +{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"DOSAR Outreach Projects The Distributed Organization for Scientific and Academic Research (DOSAR) is a 'grass-roots' cyberinfrastructure organization that focuses on community and campus based cyberinfrastructure and promotes a wide range of interdisciplinary and educational activities within the organization and its member institutions. Ongoing Events CODATA-RDA School of Research Data Science - December 2-13, 2019 - San Jose, Costa Rica Upcoming Events CODATA-RDA School of Research Data Science - January 2020, Petoria, South Africa Past Events CODATA-RDA School of Research Data Science - August 5-16, 2019 - Trieste, Italy CODATA-RDA School of Research Data Science - December 3-14, 2018 - Sao Paulo, Brazil CODATA-RDA School of Research Data Science - October 22 - November 2, 2018 - Kigali, Rwanda CODATA-RDA School of Research Data Science - August 6-17, 2018 - Trieste, Italy The African School of Physics (ASP) - June 24-July 14, 2018 - UNAM, Windhoek, Namibia CODATA-RDA School of Research Data Science - December 4-15, 2017 - Sao Paulo, Brazil Polar Hackathon - August 1-4, 2017 - Stony Brook, NY CODATA-RDA School of Research Data Science - July 10-21, 2017 - Trieste, Italy","title":"Home"},{"location":"#dosar-outreach-projects","text":"The Distributed Organization for Scientific and Academic Research (DOSAR) is a 'grass-roots' cyberinfrastructure organization that focuses on community and campus based cyberinfrastructure and promotes a wide range of interdisciplinary and educational activities within the organization and its member institutions.","title":"DOSAR Outreach Projects"},{"location":"#ongoing-events","text":"CODATA-RDA School of Research Data Science - December 2-13, 2019 - San Jose, Costa Rica","title":"Ongoing Events"},{"location":"#upcoming-events","text":"CODATA-RDA School of Research Data Science - January 2020, Petoria, South Africa","title":"Upcoming Events"},{"location":"#past-events","text":"CODATA-RDA School of Research Data Science - August 5-16, 2019 - Trieste, Italy CODATA-RDA School of Research Data Science - December 3-14, 2018 - Sao Paulo, Brazil CODATA-RDA School of Research Data Science - October 22 - November 2, 2018 - Kigali, Rwanda CODATA-RDA School of Research Data Science - August 6-17, 2018 - Trieste, Italy The African School of Physics (ASP) - June 24-July 14, 2018 - UNAM, Windhoek, Namibia CODATA-RDA School of Research Data Science - December 4-15, 2017 - Sao Paulo, Brazil Polar Hackathon - August 1-4, 2017 - Stony Brook, NY CODATA-RDA School of Research Data Science - July 10-21, 2017 - Trieste, Italy","title":"Past Events"},{"location":"ASP2018/","text":"Placeholder for ASP 2018 Lessons","title":"Index"},{"location":"ASP2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6 manual . You may enjoy browsing the Condor web page . Which Condor? 
We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2018/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2018/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"ASP2018/01-Introduction/#which-condor","text":"We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"ASP2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"ASP2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. 
In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"ASP2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password previously. If you don't know them, talk to us. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? 
Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. 
(A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"ASP2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"ASP2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"ASP2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password previously. If you don't know them, talk to us.","title":"Login to the Condor submit computer"},{"location":"ASP2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"ASP2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? 
You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. 
condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"ASP2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"ASP2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"ASP2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"ASP2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. 
State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"ASP2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"ASP2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. 
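If your copy of simple.c is missing the header names on the #include lines (the angle-bracketed names can get lost when the page is rendered), note that the program needs these standard headers for printf, atoi and sleep:
#include <stdio.h>   /* printf */
#include <stdlib.h>  /* atoi */
#include <unistd.h>  /* sleep */
With these three lines at the top of the file, the gcc command shown above compiles the program as expected.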
Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . 
This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"ASP2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"ASP2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"ASP2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"ASP2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"ASP2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
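While those three jobs work their way through the queue, you do not have to keep retyping condor_q by hand. One option, and the same trick the DAG exercise uses later, is to let watch rerun it every ten seconds (press Ctrl+C when you are done looking):
$ watch -n 10 condor_q YOUR_USER_ID -nobatch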
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"ASP2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"ASP2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"ASP2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"ASP2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"ASP2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
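(Before you move on to the challenges, here is one possible shape for the three On your own items above. It is only a sketch, and you are free to rename things, but it combines $(Cluster) and $(Process) in the file names, a single log file per submission, and one queue statement that submits ten jobs at once:
Universe = vanilla
Executable = simple
Arguments = $(Process) $(Cluster)
Log = simple.$(Cluster).log
Output = simple.$(Cluster).$(Process).out
Error = simple.$(Cluster).$(Process).error
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
Queue 10
Submit it twice and the two clusters will write separate, non-colliding output files.)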
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"ASP2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"ASP2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"ASP2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"ASP2018/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"ASP2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"ASP2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"ASP2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. 
At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"ASP2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"ASP2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"ASP2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"ASP2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? 
There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . 
In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"Running a job with R"},{"location":"ASP2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"ASP2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"ASP2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"ASP2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"ASP2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. 
CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"ASP2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. 
$ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"ASP2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"ASP2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"The answer"},{"location":"ASP2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. 
Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"ASP2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"ASP2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"ASP2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. 
The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"ASP2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"ASP2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot . A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of YOUR_USER_ID ): $ scp YOUR_USER_ID@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. 
Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run each goatbroat in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Copy file to your public area: cp mandle.gif ~/public 1. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~YOUR_USER_ID","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of YOUR_USER_ID ): $ scp YOUR_USER_ID@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. 
Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"ASP2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run each goatbroat in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"ASP2018/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Copy file to your public area: cp mandle.gif ~/public 1. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~YOUR_USER_ID","title":"Try it!"},{"location":"ASP2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. 
Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
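# The block below is just DAGMan introducing itself: a startup banner, the Condor version, the configuration files it read, and a long dump of its DAGMAN_* settings. You do not need to act on any of it, but it is handy when debugging version or configuration problems.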
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
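# The closing notes below report how often DAGMan had to defer work because of its throttles (-MaxJobs, -MaxIdle, -MaxPre, -MaxPost, and node category limits); all zeros means nothing was ever held back, and DAGMan then exits with status 0.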
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? 
Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"ASP2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual .","title":"What is DAGMan?"},{"location":"ASP2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. 
(press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
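(As a quick, optional check that is not part of the original exercise: assuming condor_history is available on your submit host and still holds the record for cluster 61, you can print the job's universe with $ condor_history 61 -limit 1 -af JobUniverse where a result of 7 means the scheduler universe and 5 would mean vanilla.) 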
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"ASP2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"ASP2018/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"ASP2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash module load imagemagick montage $* Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. 
----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 10 condor_q YOUR_USER_ID -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. 
$ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~YOUR_USER_ID . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"ASP2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"ASP2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"ASP2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"ASP2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"ASP2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"ASP2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"ASP2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"ASP2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"ASP2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash module load imagemagick montage $*","title":"wrapper_montage.sh"},{"location":"ASP2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"ASP2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"ASP2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 10 condor_q YOUR_USER_ID -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~YOUR_USER_ID . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"ASP2018/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"ASP2018/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"ASP2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"ASP2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh +ProjectName = \"osg.ConnectTrain\" arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"ASP2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"ASP2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. 
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"ASP2018/12-VariableSubstitution/#declare-your-variables","text":"First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"ASP2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"ASP2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"ASP2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"ASP2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"ASP2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
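To make the preceding recipe concrete, here is a minimal sketch of a Dockerfile for an image that is not based on one of the OSG images (the ubuntu:16.04 base and the placeholder package list are illustrative assumptions, not part of the original instructions): FROM ubuntu:16.04 RUN apt-get update && apt-get install -y <packages your job needs> # required directories RUN mkdir -p /cvmfs With this Dockerfile in the current directory, the build and push steps are exactly the ones shown above: docker build -t namespace/repository_name . and then docker push namespace/repository_name .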
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"ASP2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"ASP2018/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"ASP2018/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. 
This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"ASP2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"ASP2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"ASP2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"ASP2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. 
The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"ASP2018/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"ASP2018/ASP2018_Materials/","text":"ASP 2018 Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Tuesday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Tuesday Afternoon - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Tuesday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Wednesday Morning - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Wednesday Morning - Computational Infrastructures - Session 5 ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Wednesday Afternoon - Computational Infrastructures - Session 6 Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Dick Greenwood - greenw@phys.latech.edu Chris Walker - walker@nhn.ou.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the African School of Physics 2018. If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"ASP 2018 Material"},{"location":"ASP2018/ASP2018_Materials/#asp-2018-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"ASP 2018 Materials"},{"location":"ASP2018/ASP2018_Materials/#tuesday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Tuesday Morning - Computational Infrastructures - Session 1"},{"location":"ASP2018/ASP2018_Materials/#tuesday-afternoon-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Tuesday Afternoon - Computational Infrastructures - Session 2"},{"location":"ASP2018/ASP2018_Materials/#tuesday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Tuesday Afternoon - Computational Infrastructures - Session 3"},{"location":"ASP2018/ASP2018_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"ASP2018/ASP2018_Materials/#wednesday-morning-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Wednesday Morning - Computational Infrastructures - Session 4"},{"location":"ASP2018/ASP2018_Materials/#wednesday-morning-computational-infrastructures-session-5","text":"ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise","title":"Wednesday Morning - Computational Infrastructures - Session 5"},{"location":"ASP2018/ASP2018_Materials/#wednesday-afternoon-computational-infrastructures-session-6","text":"Computational Infrastructures Wrap Up - Slides","title":"Wednesday Afternoon - Computational Infrastructures - Session 6"},{"location":"ASP2018/ASP2018_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Dick Greenwood - greenw@phys.latech.edu Chris Walker - walker@nhn.ou.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the African School of Physics 2018. 
If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"Contact information"},{"location":"ASP2018/ASP2018_Schedule/","text":"ASP 2018 High-Level Curriculum Overview Tuesday AM Session Welcome and Background The Landscape of Research Computing Tuesday PM Session Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Wednesday AM Session Brief Introduction to clouds and containers ATLAS Analysis Example Wednesday PM Session Close out and resources for further collaboration Detailed Schedule Tuesday 10-July 2018 Time Description Instructor 10:50 Welcome and the Landscape of Research Computing Pat Skubic 11:35 Exercise - UNIX Refresher, Running simple Condor jobs All 12:20 Lunch 13:40 Profiling your application and finding a home for your workflow Julia Gray 14:25 Exercise - Running scripting and R jobs with Condor All 15:10 Coffee Break 15:30 Workflows and distributed environments Horst Severini 16:15 Exercise - DAGMan All Wednesday 11-July 2018 Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 10:50 ATLAS Analysis Example Pat Skubic 11:35 Exercises All 12:20 Lunch 13:40 Closing Thoughts Pat Skubic Materials Materials Page","title":"ASP 2018 Schedule"},{"location":"ASP2018/ASP2018_Schedule/#asp-2018","text":"","title":"ASP 2018"},{"location":"ASP2018/ASP2018_Schedule/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"ASP2018/ASP2018_Schedule/#tuesday-am-session","text":"Welcome and Background The Landscape of Research Computing","title":"Tuesday AM Session"},{"location":"ASP2018/ASP2018_Schedule/#tuesday-pm-session","text":"Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Tuesday PM Session"},{"location":"ASP2018/ASP2018_Schedule/#wednesday-am-session","text":"Brief Introduction to clouds and containers ATLAS Analysis Example","title":"Wednesday AM Session"},{"location":"ASP2018/ASP2018_Schedule/#wednesday-pm-session","text":"Close out and resources for further collaboration","title":"Wednesday PM Session"},{"location":"ASP2018/ASP2018_Schedule/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2018/ASP2018_Schedule/#tuesday-10-july-2018","text":"Time Description Instructor 10:50 Welcome and the Landscape of Research Computing Pat Skubic 11:35 Exercise - UNIX Refresher, Running simple Condor jobs All 12:20 Lunch 13:40 Profiling your application and finding a home for your workflow Julia Gray 14:25 Exercise - Running scripting and R jobs with Condor All 15:10 Coffee Break 15:30 Workflows and distributed environments Horst Severini 16:15 Exercise - DAGMan All","title":"Tuesday 10-July 2018"},{"location":"ASP2018/ASP2018_Schedule/#wednesday-11-july-2018","text":"Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 10:50 ATLAS Analysis Example Pat Skubic 11:35 Exercises All 12:20 Lunch 13:40 Closing Thoughts Pat Skubic","title":"Wednesday 11-July 
2018"},{"location":"ASP2018/ASP2018_Schedule/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2018/AnalysisExample/","text":"ATLAS Analysis Example Introduction Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker. Prerequisite Login on submission node $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command. Simple Analysis Example Step 1: Create simulated data using the grid Now in your test directory on the submission host we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs on your local desktop and then copying the created files to the submission host. Or the nano editor can be run directly on the submission host. A typical copy command would be as follows. $ scp run-root.* YOUR_USER_ID@user-training.osgconnect.net:analysis_example/ It is probably easier to create all scripts with nano on the submission node, though, and then you won't have to copy ( scp ) anything at all. So everything below assumes you are logged on to a terminal session on the submission node. First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue Note that the executable script is: run-root.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. 
You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root You can then inspect the contents of t00.root and t01.root by running Root in your current directory in the local terminal window in which you just ran the wget command: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In Root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q . Step 2: Analyze Real Data Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b -q readEvents.C+ > root-z.out This script runs Root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . 
It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); 
// print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them. 
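If you would rather draw one particular histogram without the TBrowser, a minimal sketch of Root commands is (typed at the Root prompt after opening histograms-z.root ; zMass is one of the histogram names booked in readEvents.C , and _file0 is the name Root assigns to a file opened on the command line): TH1F *h = (TH1F*)_file0->Get(\"zMass\"); h->Draw();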
Step 3: Make TSelector Now let's go back to the files created in step 1, in the remote terminal window. Start root in your test directory with the following commands: $ module load root $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data is these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue Create run-root-2.sh : #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the Root job on the osgconnect training machine by issuing command: root -b < run-root-2.C If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"ATLAS Analysis Example"},{"location":"ASP2018/AnalysisExample/#atlas-analysis-example","text":"","title":"ATLAS Analysis Example"},{"location":"ASP2018/AnalysisExample/#introduction","text":"Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. 
This example is based on a demo developed by OU programmer Chris Walker.","title":"Introduction"},{"location":"ASP2018/AnalysisExample/#prerequisite","text":"Login on submission node $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command.","title":"Prerequisite"},{"location":"ASP2018/AnalysisExample/#simple-analysis-example","text":"","title":"Simple Analysis Example"},{"location":"ASP2018/AnalysisExample/#step-1-create-simulated-data-using-the-grid","text":"Now in your test directory on the submission host we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs on your local desktop and then copying the created files to the submission host. Or the nano editor can be run directly on the submission host. A typical copy command would be as follows. $ scp run-root.* YOUR_USER_ID@user-training.osgconnect.net:analysis_example/ It is probably easier to create all scripts with nano on the submission node, though, and then you won't have to copy ( scp ) anything at all. So everything below assumes you are logged on to a terminal session on the submission node. First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue Note that the executable script is: run-root.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. 
You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root You can then inspect the contents of t00.root and t01.root by running Root in your current directory in the local terminal window in which you just ran the wget command: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In Root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q .","title":"Step 1: Create simulated data using the grid"},{"location":"ASP2018/AnalysisExample/#step-2-analyze-real-data","text":"Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b -q readEvents.C+ > root-z.out This script runs Root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . 
It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); 
// print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. 
Double click on histograms-z.root , and then on the variables to plot them.","title":"Step 2: Analyze Real Data"},{"location":"ASP2018/AnalysisExample/#step-3-make-tselector","text":"Now let's go back to the files created in step 1, in the remote terminal window. Start root in your test directory with the following commands: $ module load root $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data is these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue Create run-root-2.sh : #!/bin/bash source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load root module load libXpm root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the Root job on the osgconnect training machine by issuing command: root -b < run-root-2.C If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . 
This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"Step 3: Make TSelector"},{"location":"ASP2018/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon 2017"},{"location":"ASP2018/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"ASP2018/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2018/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"ASP2018/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"ASP2018/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"ASP2018/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2018/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon School Materials"},{"location":"ASP2018/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"ASP2018/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"ASP2018/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"ASP2018/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"ASP2018/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"ASP2018/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"ASP2018/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"ASP2018/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"ASP2018/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"ASP2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing 
Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"ASP2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"ASP2018/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"ASP2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2018/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"ASP2018/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"ASP2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2022/","text":"Placeholder for ASP 2018 Lessons","title":"Index"},{"location":"ASP2022/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 9.12.0, which is a recent version of Condor. Condor has two coexisting types of releases at any given time: Feature (development) and Long Term Support (stable). Condor 9.12.0 is considered a stable release, while 9.13.1 is considered a development release. You can know 9.12.0 is stable because the second digit (a 12 in this case) is an even number, while in the development version 9.13.1 it is an odd number (13 in this case). In a given stable series, all versions have the same features (for example 9.10.16 and 9.10.17 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'DOSAR/ASP December 2022'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. 
You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2022/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2022/01-Introduction/#preliminaries","text":"You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"ASP2022/01-Introduction/#which-condor","text":"We will be using Condor 9.12.0, which is a recent version of Condor. Condor has two coexisting types of releases at any given time: Feature (development) and Long Term Support (stable). Condor 9.12.0 is considered a stable release, while 9.13.1 is considered a development release. You can know 9.12.0 is stable because the second digit (a 12 in this case) is an even number, while in the development version 9.13.1 it is an odd number (13 in this case). In a given stable series, all versions have the same features (for example 9.10.16 and 9.10.17 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"ASP2022/01-Introduction/#where-you-will-work","text":"Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'DOSAR/ASP December 2022'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"ASP2022/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. 
For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"The Exercises"},{"location":"ASP2022/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 9.12.0 2022-10-05 BuildID: 608474 PackageID: 9.12.0-1.1 $ $CondorPlatform: X86_64-Ubuntu_20.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 20.04, but you might notice that we're running on Ubuntu 20.04.5, which is a slightly newer version. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 20.04.5 LTS \\n \\l Or you can run: $ lsb_release -a o No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 20.04.5 LTS Release: 20.04 Codename: focal Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. 
It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) 
To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"ASP2022/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"ASP2022/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"ASP2022/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom","title":"Login to the Condor submit computer"},{"location":"ASP2022/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 9.12.0 2022-10-05 BuildID: 608474 PackageID: 9.12.0-1.1 $ $CondorPlatform: X86_64-Ubuntu_20.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 20.04, but you might notice that we're running on Ubuntu 20.04.5, which is a slightly newer version.","title":"Looking at our Condor installation"},{"location":"ASP2022/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 20.04.5 LTS \\n \\l Or you can run: $ lsb_release -a o No LSB modules are available. 
Distributor ID: Ubuntu Description: Ubuntu 20.04.5 LTS Release: 20.04 Codename: focal Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_collector. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) 
You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"ASP2022/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"ASP2022/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"ASP2022/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"ASP2022/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. 
Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"ASP2022/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"ASP2022/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
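# All three jobs share the same cluster number (34) and differ only in the process number (0, 1, 2):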
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . 
This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"ASP2022/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"ASP2022/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"ASP2022/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"ASP2022/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"ASP2022/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"ASP2022/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"ASP2022/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"ASP2022/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"ASP2022/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"ASP2022/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"ASP2022/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"ASP2022/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"ASP2022/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"ASP2022/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"ASP2022/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"ASP2022/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"ASP2022/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. 
At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"ASP2022/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"ASP2022/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"ASP2022/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"ASP2022/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. 
These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. 
Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"Running a job with R"},{"location":"ASP2022/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"ASP2022/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"ASP2022/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"ASP2022/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"ASP2022/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) 
We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"ASP2022/06-RJob/#setup","text":"You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"ASP2022/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"ASP2022/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"The answer"},{"location":"ASP2022/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . 
Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"ASP2022/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"ASP2022/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"ASP2022/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. 
First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"ASP2022/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"ASP2022/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot . Downloading the needed executables Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd .. A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point Browser at the file URL: open a Terminal on your local CentOS VM, and then type in there: firefox Downloads/mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run each goatbroat in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbroat 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! 
Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2022/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2022/08-Mandlebrot/#downloading-the-needed-executables","text":"Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd ..","title":"Downloading the needed executables"},{"location":"ASP2022/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point Browser at the file URL: open a Terminal on your local CentOS VM, and then type in there: firefox Downloads/mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"ASP2022/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run each goatbroat in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) 
Run goatbroat 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"ASP2022/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"Try it!"},{"location":"ASP2022/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . 
JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
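If you want to check the universe for yourself while a DAG is still in the queue, look at the DAGMan job's ClassAd: it carries a JobUniverse attribute, and a value of 7 corresponds to the scheduler universe. Using the DAGMan job ID from this walkthrough (61.0 here; yours will differ), you should see something like:

$ condor_q -l 61.0 | grep JobUniverse
JobUniverse = 7

The generated submit description below tells the same story (note the universe = scheduler line).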
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2022/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2022/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"ASP2022/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"ASP2022/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. 
In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"ASP2022/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"ASP2022/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"ASP2022/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . 
We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory. goatbrot1.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . 
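Newer HTCondor releases also ship a dedicated condor_watch_q command that gives a similar live-updating view without wrapping condor_q in watch. Whether it is available depends on the HTCondor version on your training machine, so treat this as an optional alternative:
$ condor_watch_q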
Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? 
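One convenient way to pull the progress summaries out of the DAGMan output file is to search for the node-status table that DAGMan prints whenever the state of the DAG changes (a sketch; the exact wording of these lines can vary between HTCondor versions):
$ grep -A 3 'Of 5 nodes total' goatbrot.dag.dagman.out
Each match shows how many nodes were Done, Queued, Ready, and so on at that point in the run.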
Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"ASP2022/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"ASP2022/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"ASP2022/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory.","title":"Make your job submission files"},{"location":"ASP2022/10-ComplexDAG/#goatbrot1sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"ASP2022/10-ComplexDAG/#goatbrot2sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"ASP2022/10-ComplexDAG/#goatbrot3sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"ASP2022/10-ComplexDAG/#goatbrot4sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"ASP2022/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. 
universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"ASP2022/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"ASP2022/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"ASP2022/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! 
(All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"ASP2022/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"ASP2022/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. 
DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... 
Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Handling a DAG that fails"},{"location":"ASP2022/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"ASP2022/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. 
DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 
06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"ASP2022/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2022/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2022/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"ASP2022/12-VariableSubstitution/#declare-your-variables","text":"First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . 
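For example, following the parameters used in the submit files earlier in this lesson, the node that produces tile_0_1.ppm (the old goatbrot2.sub) would look something like this (a sketch; double-check the center coordinates against your own files):
JOB g2 goatbrot.sub
VARS g2 CENTERX=\"0.75\"
VARS g2 CENTERY=\"0.75\"
VARS g2 TILEX=\"1\"
VARS g2 TILEY=\"0\"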
What happens?","title":"Declare your variables"},{"location":"ASP2022/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"ASP2022/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"ASP2022/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"ASP2022/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"ASP2022/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. 
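If you are curious whether Singularity is installed on the machine you are logged in to, you can ask it for its version (an optional check; on newer systems the tool may be installed under the name apptainer instead):
$ singularity --version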
For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . 
The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"ASP2022/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"ASP2022/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"ASP2022/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. 
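If you want to see this for yourself, a small test script submitted with the HAS_SINGULARITY requirement shown in the examples below can report which image, if any, the job actually ran in (a minimal sketch; the SINGULARITY_CONTAINER variable is set by Singularity itself and may be absent on sites that run jobs without a container):
#!/bin/bash
# job.sh - report the OS and the container image (if any) this job ran in
cat /etc/redhat-release /etc/os-release 2>/dev/null
echo \"Container image: ${SINGULARITY_CONTAINER:-none detected}\"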
The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"ASP2022/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"ASP2022/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"ASP2022/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . 
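After the editing step described next you end up with an ordinary Dockerfile; a stripped-down sketch of what such a file might contain is (the base image and the python3 package are placeholders for whatever your job actually needs, and the /cvmfs directory matches the note later in this section):
FROM ubuntu:16.04
# install whatever software your job needs
RUN apt-get update && apt-get install -y python3 && apt-get clean
# required directory so tools and data on /cvmfs are reachable inside the container
RUN mkdir -p /cvmfs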
Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"ASP2022/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"ASP2022/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"ASP2022/ASP2022_Materials/","text":"ASP 2022 Materials We will be using OSG Training Pool for this set of sessions. Please visit https://notebook.ospool.osg-htc.org/hub/login to log in. Wednesday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday Afternoon - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Wednesday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Morning - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Thursday Morning - Computational Infrastructures - Session 5 ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"ASP 2022 Materials"},{"location":"ASP2022/ASP2022_Materials/#asp-2022-materials","text":"We will be using OSG Training Pool for this set of sessions. 
Please visit https://notebook.ospool.osg-htc.org/hub/login to log in.","title":"ASP 2022 Materials"},{"location":"ASP2022/ASP2022_Materials/#wednesday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday Morning - Computational Infrastructures - Session 1"},{"location":"ASP2022/ASP2022_Materials/#wednesday-afternoon-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday Afternoon - Computational Infrastructures - Session 2"},{"location":"ASP2022/ASP2022_Materials/#wednesday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Wednesday Afternoon - Computational Infrastructures - Session 3"},{"location":"ASP2022/ASP2022_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"ASP2022/ASP2022_Materials/#thursday-morning-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Morning - Computational Infrastructures - Session 4"},{"location":"ASP2022/ASP2022_Materials/#thursday-morning-computational-infrastructures-session-5","text":"ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides","title":"Thursday Morning - Computational Infrastructures - Session 5"},{"location":"ASP2022/ASP2022_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"Contact information"},{"location":"ASP2022/ASP2022_Schedule/","text":"ASP 2022 High-Level Curriculum Overview Wednesday AM Session Welcome and Background The Landscape of Research Computing Wednesday PM Session Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Thursday AM Session Brief Introduction to clouds and containers ATLAS Analysis Example Close out and resources for further collaboration Detailed Schedule Wednesday 7-December 2022 Time Description Instructor 11:00 Welcome and the Landscape of Research Computing Jae Yu 11:45 Exercise - UNIX Refresher, Running simple Condor jobs All 12:30 Lunch 14:00 Profiling your application and finding a home for your workflow Pat Skubic 14:45 Exercise - Running scripting and R jobs with Condor All 15:30 Coffee Break 16:00 Workflows and distributed environments Horst Severini 16:45 Exercise - DAGMan All Thursday 8-December 2022 Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 11:00 ATLAS Analysis Example and Exercises Pat Skubic 11:45 Closing Thoughts All 12:30 Lunch Materials Materials Page","title":"ASP 2022 Schedule"},{"location":"ASP2022/ASP2022_Schedule/#asp-2022","text":"","title":"ASP 
2022"},{"location":"ASP2022/ASP2022_Schedule/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"ASP2022/ASP2022_Schedule/#wednesday-am-session","text":"Welcome and Background The Landscape of Research Computing","title":"Wednesday AM Session"},{"location":"ASP2022/ASP2022_Schedule/#wednesday-pm-session","text":"Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Wednesday PM Session"},{"location":"ASP2022/ASP2022_Schedule/#thursday-am-session","text":"Brief Introduction to clouds and containers ATLAS Analysis Example Close out and resources for further collaboration","title":"Thursday AM Session"},{"location":"ASP2022/ASP2022_Schedule/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2022/ASP2022_Schedule/#wednesday-7-december-2022","text":"Time Description Instructor 11:00 Welcome and the Landscape of Research Computing Jae Yu 11:45 Exercise - UNIX Refresher, Running simple Condor jobs All 12:30 Lunch 14:00 Profiling your application and finding a home for your workflow Pat Skubic 14:45 Exercise - Running scripting and R jobs with Condor All 15:30 Coffee Break 16:00 Workflows and distributed environments Horst Severini 16:45 Exercise - DAGMan All","title":"Wednesday 7-December 2022"},{"location":"ASP2022/ASP2022_Schedule/#thursday-8-december-2022","text":"Time Description Instructor 09:00 A Brief Introduction to Clouds and Containers Horst Severini 09:45 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 10:30 Coffee Break 11:00 ATLAS Analysis Example and Exercises Pat Skubic 11:45 Closing Thoughts All 12:30 Lunch","title":"Thursday 8-December 2022"},{"location":"ASP2022/ASP2022_Schedule/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2022/AnalysisExample/","text":"ATLAS Analysis Example Introduction Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker. Prerequisite Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command. Simple Analysis Example Step 1: Create simulated data using the grid Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. ----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. 
It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q . Step 2: Analyze Real Data We will not submit grid jobs during this exercise. So we will skip submit script. ----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. 
You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", 
\";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. 
----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them. Step 3: Make TSelector Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
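Before creating the run-root-2 scripts below, it can be worth checking that the modified selector still compiles. A minimal sketch, assuming s0.C and s0.h are in the current directory, is to let ACLiC build it once in a throwaway batch session:

$ root -b <<'EOF'
.L s0.C++
.q
EOF

If ACLiC reports errors here, fix them in s0.C or s0.h before moving on; run-root-2.C below loads the class with the same .L s0.C++ line.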
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"ATLAS Analysis Example"},{"location":"ASP2022/AnalysisExample/#atlas-analysis-example","text":"","title":"ATLAS Analysis Example"},{"location":"ASP2022/AnalysisExample/#introduction","text":"Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker.","title":"Introduction"},{"location":"ASP2022/AnalysisExample/#prerequisite","text":"Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command.","title":"Prerequisite"},{"location":"ASP2022/AnalysisExample/#simple-analysis-example","text":"","title":"Simple Analysis Example"},{"location":"ASP2022/AnalysisExample/#step-1-create-simulated-data-using-the-grid","text":"Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. 
This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. ----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . 
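If you would like a quick text summary of one of these files without opening the TBrowser, a small sketch along the following lines should also work (run it in the directory containing t00.root):

$ root -b <<'EOF'
TFile f("t00.root");
TTree *t = (TTree*)f.Get("t0");
t->Print();
.q
EOF

You should see the Run, Event, and Energy branches, each with the 100 entries filled by run-root.C.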
You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q .","title":"Step 1: Create simulated data using the grid"},{"location":"ASP2022/AnalysisExample/#step-2-analyze-real-data","text":"We will not submit grid jobs during this exercise. So we will skip submit script. ----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,/home/pskubic/public/muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} 
[GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // 
draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. ----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them.","title":"Step 2: Analyze Real Data"},{"location":"ASP2022/AnalysisExample/#step-3-make-tselector","text":"Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
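Looking back at the Step 2 output for a moment: if you want to confirm from the command line that run-z.sh filled everything before browsing it, a minimal sketch that lists the contents of histograms-z.root is:

$ root -b <<'EOF'
TFile f("histograms-z.root");
f.ls();
.q
EOF

The listing should include zMass, zPt, and the individual muon histograms written by readEvents.C.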
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"Step 3: Make TSelector"},{"location":"ASP2022/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon 2017"},{"location":"ASP2022/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"ASP2022/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2022/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"ASP2022/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"ASP2022/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"ASP2022/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2022/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. 
Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon School Materials"},{"location":"ASP2022/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"ASP2022/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"ASP2022/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"ASP2022/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"ASP2022/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"ASP2022/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"ASP2022/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"ASP2022/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"ASP2022/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"ASP2022/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2022/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"ASP2022/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"ASP2022/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"ASP2022/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2022/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"ASP2022/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources 
Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"ASP2022/School/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2024/","text":"Placeholder for ASP 2024 Lessons","title":"Index"},{"location":"ASP2024/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 23.7.2, which is a recent version of Condor. Where you will work Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'Data Science'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2024/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"ASP2024/01-Introduction/#preliminaries","text":"You will need your Gmail or GitHub credentials for this session. You might want to refer to the online Condor manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"ASP2024/01-Introduction/#which-condor","text":"We will be using Condor 23.7.2, which is a recent version of Condor.","title":"Which Condor?"},{"location":"ASP2024/01-Introduction/#where-you-will-work","text":"Today you will log into https://notebook.ospool.osg-htc.org/hub/login for all of your exercises: Login on submission node using a web browser: https://notebook.ospool.osg-htc.org/hub/login Click on 'Sign in with CILogon'. Select the Identity Provider Google (or GitHub). Click 'Log On'. Log into your Google account (or GitHub). Click the 'Server Option' 'Data Science'. Click 'Start'. This will take some time. In the 'Launcher' window, click on 'Terminal' (bottom left). When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. 
You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"ASP2024/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into a terminal in the ospool notebook above.","title":"The Exercises"},{"location":"ASP2024/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 23.7.2 2024-05-16 BuildID: 733409 PackageID: 23.7.2-0.2 GitSHA: 585ec167 $ $CondorPlatform: X86_64-Ubuntu_22.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 22.04, but you might notice that we're running on Ubuntu 22.04.4, which is a slightly newer version. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 22.04.4 LTS \\n \\l Or you can run: $ lsb_release -a o No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 22.04.4 LTS Release: 22.04 Codename: jammy Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_config.local config.d ganglia.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /home/jovyan/.condor/local # at: /etc/condor/condor_config.local, line 2 # raw: LOCAL_DIR = $ENV(HOME)/.condor/local $ s -CF /home/jovyan/.condor/local/ cred_dir/ execute/ lock/ log/ run/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep jovyan 17 0.0 0.0 23844 7240 ? Ss 19:32 0:00 condor_master jovyan 18 0.0 0.0 7620 2372 ? S 19:32 0:00 \\_ condor_procd -A /home/jovyan/.condor/local/run/procd_pipe -L /home/jovyan/ jovyan 19 0.0 0.0 18200 8284 ? Ss 19:32 0:00 \\_ condor_shared_port jovyan 20 0.0 0.0 20180 9640 ? Ss 19:32 0:00 \\_ condor_collector jovyan 21 0.0 0.0 20688 10028 ? Ss 19:32 0:00 \\_ condor_negotiator jovyan 22 0.0 0.0 21320 10104 ? 
Ss 19:32 0:00 \\_ condor_schedd jovyan 23 0.0 0.0 21136 10172 ? Ss 19:32 0:00 \\_ condor_startd For this version of Condor there are these processes running: the condor_master, the condor_schedd, the condor_procd, the condor_collector, the condor_negotiator, and condor_shared_port. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: jovyan@jupyter-email-3ahorst-2eseverini-40gmail-2ecom : <127.0.0.1:9618?... @ 07/02/24 19:44:46 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for jovyan: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) 
condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@jupyter-email-3ahorst-2eseverini-40gmail-2ecom LINUX X86_64 Unclaimed Idle 0.000 257750 0+00:14:02 Total Owner Claimed Unclaimed Matched Preempting Drain Backfill BkIdle X86_64/LINUX 1 0 0 1 0 0 0 0 0 Total 1 0 0 1 0 0 0 0 0 ... Let's look at exactly what you can see (this will look differently on different condor pools): Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"ASP2024/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"ASP2024/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"ASP2024/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into https://notebook.ospool.osg-htc.org/hub/login with your Gmail or GitHub account. $ hostname jupyter-...-40gmail-2ecom","title":"Login to the Condor submit computer"},{"location":"ASP2024/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 23.7.2 2024-05-16 BuildID: 733409 PackageID: 23.7.2-0.2 GitSHA: 585ec167 $ $CondorPlatform: X86_64-Ubuntu_22.04 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not necessarily the computer we're running on. It was built on Ubuntu 22.04, but you might notice that we're running on Ubuntu 22.04.4, which is a slightly newer version.","title":"Looking at our Condor installation"},{"location":"ASP2024/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? 
You can usually look in /etc/issue to find out: $ cat /etc/issue Ubuntu 22.04.4 LTS \\n \\l Or you can run: $ lsb_release -a o No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 22.04.4 LTS Release: 22.04 Codename: jammy Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_config.local config.d ganglia.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /home/jovyan/.condor/local # at: /etc/condor/condor_config.local, line 2 # raw: LOCAL_DIR = $ENV(HOME)/.condor/local $ s -CF /home/jovyan/.condor/local/ cred_dir/ execute/ lock/ log/ run/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep jovyan 17 0.0 0.0 23844 7240 ? Ss 19:32 0:00 condor_master jovyan 18 0.0 0.0 7620 2372 ? S 19:32 0:00 \\_ condor_procd -A /home/jovyan/.condor/local/run/procd_pipe -L /home/jovyan/ jovyan 19 0.0 0.0 18200 8284 ? Ss 19:32 0:00 \\_ condor_shared_port jovyan 20 0.0 0.0 20180 9640 ? Ss 19:32 0:00 \\_ condor_collector jovyan 21 0.0 0.0 20688 10028 ? Ss 19:32 0:00 \\_ condor_negotiator jovyan 22 0.0 0.0 21320 10104 ? Ss 19:32 0:00 \\_ condor_schedd jovyan 23 0.0 0.0 21136 10172 ? Ss 19:32 0:00 \\_ condor_startd For this version of Condor there are these processes running: the condor_master, the condor_schedd, the condor_procd, the condor_collector, the condor_negotiator, and condor_shared_port. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. 
It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"ASP2024/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -nobatch -- Schedd: jovyan@jupyter-email-3ahorst-2eseverini-40gmail-2ecom : <127.0.0.1:9618?... @ 07/02/24 19:44:46 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for jovyan: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"ASP2024/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q (Will not work on this ospool training machine.) condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"ASP2024/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"ASP2024/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@jupyter-email-3ahorst-2eseverini-40gmail-2ecom LINUX X86_64 Unclaimed Idle 0.000 257750 0+00:14:02 Total Owner Claimed Unclaimed Matched Preempting Drain Backfill BkIdle X86_64/LINUX 1 0 0 1 0 0 0 0 0 Total 1 0 0 1 0 0 0 0 0 ... Let's look at exactly what you can see (this will look differently on different condor pools): Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. 
Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"ASP2024/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"ASP2024/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
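# Optional, hedged aside (not part of the original walkthrough): if the
# condor_watch_q tool happens to be installed on your submit host, it gives a
# live, self-updating view of these three jobs as they move from idle to
# running to done, so you don't have to keep re-running condor_q by hand.
# Press Ctrl-C to leave it; the plain condor_q commands shown below always
# work regardless of whether condor_watch_q is available.
$ condor_watch_q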
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . 
This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"ASP2024/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"ASP2024/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"ASP2024/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Let's create a directory to perform the condor tests. Execute the following two commands in your Linux shell: $ mkdir -p ~/condor-test $ cd ~/condor-test Next, create a file called simple.c using your favorite editor. Use your preferred text editor to create this C program. If you don't have one, use 'nano', as shown below. The file will be in the condor-test directory. In that file, put the following text. Copy and paste is a good choice: $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } To copy and paste code into nano on ospool, it is convenient to use key strokes: CTRL-c (windows), command-c (mac) to copy; and CTRL-v (windows), command-v (mac) to paste. To save your file and exit nano, type: CTRL-x, y, return. Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"ASP2024/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. 
Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 2056. Now, watch your job run (insert your username in the command below instead of YOUR_USER_ID . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (2056.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (2056.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (2056.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (2056.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"ASP2024/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 34. 
$ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q YOUR_USER_ID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q YOUR_USERID -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"ASP2024/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"ASP2024/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"ASP2024/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"ASP2024/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"ASP2024/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"ASP2024/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
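Before the full example below, here is a quick sketch of the forms condor_rm accepts (the cluster and process numbers are taken from the example that follows, and the -all form is mentioned again in the tips at the end of this section):
$ condor_rm 29.0    # remove a single job: cluster 29, process 0
$ condor_rm 29      # remove every process in cluster 29
$ condor_rm -all    # remove all of your own jobs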
For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"ASP2024/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"ASP2024/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"ASP2024/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 23.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 23.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 23.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 23.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 23.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 23.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 23.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 23.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"ASP2024/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"ASP2024/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history YOUR_USER_ID For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"ASP2024/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. 
At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"ASP2024/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"ASP2024/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you would like to a write program in the Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 You will need to make this shell script executable with chmod +x simple.sh , and then you can run it with ./simple.sh 3 4 . Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"ASP2024/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"ASP2024/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. 
These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. 
Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"Running a job with R"},{"location":"ASP2024/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"ASP2024/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"ASP2024/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"ASP2024/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"ASP2024/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) 
We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"ASP2024/06-RJob/#setup","text":"You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask us. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Invoke R with the proper environment /opt/conda/bin/R --slave --vanilla < $1 You could easily execute this on OSpool by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"ASP2024/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"ASP2024/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) queue","title":"The answer"},{"location":"ASP2024/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . 
Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"ASP2024/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"ASP2024/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"ASP2024/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. 
First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"ASP2024/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"ASP2024/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot . Downloading the needed executables Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd .. A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point your browser at the file: open a Terminal on your local CentOS VM, and then type: firefox Downloads/mandle.gif The goatbrot program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.) Run goatbrot 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid.
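If you are wondering where those four center points and the width of 1.5 come from, here is a small shell sketch (illustrative only; the n variable and the awk loop are not part of the exercise) that prints one goatbrot command per tile of an n-by-n split of the same region, centered at (0,0) with total width 3:
#!/bin/sh
# Sketch: print one goatbrot command per tile of an n x n split
# of the region centered at (0,0) with total width 3.
n=2
awk -v n=$n 'BEGIN {
    w = 3 / n;                        # width of a single tile
    for (row = 0; row < n; row++) {
        for (col = 0; col < n; col++) {
            cx = -1.5 + w/2 + col*w;  # x coordinate of the tile center
            cy = 1.5 - w/2 - row*w;   # y coordinate of the tile center
            printf \"~/bin/goatbrot -i 1000 -o tile_%06d_%06d.ppm -c %g,%g -w %g -s 500,500\\n\", row, col, cx, cy, w;
        }
    }
}'
With n=2 this prints the same four commands shown above; increasing n is one way to approach the bigger-DAG challenge in a later exercise.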
The montage program simply stitches the files together. Try it! Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2024/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"ASP2024/08-Mandlebrot/#downloading-the-needed-executables","text":"Since your training VMs don't have the goatbrot executable needed for this exercise, we will need to download it first. Execute the following commands to do this: $ source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt $ mkdir bin $ cd bin $ wget https://www.nhn.ou.edu/~hs/tmp/goatbrot $ chmod +x goatbrot $ cd ..","title":"Downloading the needed executables"},{"location":"ASP2024/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Convert it to a GIF image and write it into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif We need to download the GIF file from your training VM to your local desktop. To do so, find the file mandle.gif in the list of files and directories in the side bar to the left of your terminal window. Right-click on it and select Download, and download it to your local desktop. Point your browser at the file: open a Terminal on your local CentOS VM, and then type: firefox Downloads/mandle.gif The goatbrot program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"ASP2024/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. (This is back in your first terminal, where you are logged into the osgconnect machine.)
Run goatbrot 4 times : $ ~/bin/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ ~/bin/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"ASP2024/08-Mandlebrot/#try-it","text":"Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Find the file mandle.gif in your side bar again. 2. Download it and display it in Firefox.","title":"Try it!"},{"location":"ASP2024/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep, but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag .
JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
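If you want to double-check that, here is one quick way (a sketch: 61.0 is the DAGMan job's cluster ID from the submission above, and 7 is the numeric value HTCondor uses for the scheduler universe): $ condor_history 61.0 -af JobUniverse 7 Now let's look at the files DAGMan left behind.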
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2024/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"ASP2024/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"ASP2024/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"ASP2024/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file (called submit ) that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. Make sure you are in the condor-test directory that you used before. cd ~/condor-test . If the directory condor-test was deleted, you will have to create it again and create and compile the simple program again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. Click the + sign just to the right of the title 'Terminal 1' -- NOT the x! -- in order to open additional terminals. 
In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . JOB simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue. As before, replace YOUR_USER_ID with your username. (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q YOUR_USER_ID -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"ASP2024/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"ASP2024/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"ASP2024/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . 
We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory. goatbrot1.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . 
Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? 
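One handy way to pull that progress out of the file (a small sketch; your timestamps and job IDs will differ) is to grep for the node-status tables that DAGMan writes whenever something changes: $ grep -A 3 'nodes total' goatbrot.dag.dagman.out Each match shows the Done / Pre / Queued / Post / Ready / Un-Ready / Failed counts at that moment, and on a successful run the last table shows all 5 nodes Done.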
Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"ASP2024/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"ASP2024/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"ASP2024/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. You have placed the goatbrot executable in your bin directory: ~/bin/goatbrot . Condor does not deal well with ~/ as the home directory, so we will use the full path /home/jovyan/bin/ instead in the submit scripts, which goes to the same directory.","title":"Make your job submission files"},{"location":"ASP2024/10-ComplexDAG/#goatbrot1sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"ASP2024/10-ComplexDAG/#goatbrot2sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"ASP2024/10-ComplexDAG/#goatbrot3sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"ASP2024/10-ComplexDAG/#goatbrot4sub","text":"executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"ASP2024/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on the VM. 
universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"ASP2024/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"ASP2024/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"ASP2024/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 1 condor_q YOUR_USER_ID -nobatch To quit watch command, press Ctrl-c . Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! 
(All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can download it again from the left side bar, and then display it with Firefox. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"ASP2024/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"ASP2024/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. 
DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... 
Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Handling a DAG that fails"},{"location":"ASP2024/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"ASP2024/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. 
DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this: universe = vanilla executable = /usr/bin/montage arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 
06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = /usr/bin/montage arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"ASP2024/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2024/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"ASP2024/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"ASP2024/12-VariableSubstitution/#declare-your-variables","text":"First declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /home/jovyan/bin/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . 
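For reference, here is a sketch of what the complete DAG could look like once every goatbrot node shares the single goatbrot.sub file. The CENTERX/CENTERY and TILEX/TILEY values are read off the centers and tile names in the four original submit files, so double-check them against your own files before submitting:

JOB g1 goatbrot.sub
JOB g2 goatbrot.sub
JOB g3 goatbrot.sub
JOB g4 goatbrot.sub
JOB montage montage.sub
VARS g1 CENTERX=\"-0.75\"
VARS g1 CENTERY=\"0.75\"
VARS g1 TILEX=\"0\"
VARS g1 TILEY=\"0\"
VARS g2 CENTERX=\"0.75\"
VARS g2 CENTERY=\"0.75\"
VARS g2 TILEX=\"1\"
VARS g2 TILEY=\"0\"
VARS g3 CENTERX=\"-0.75\"
VARS g3 CENTERY=\"-0.75\"
VARS g3 TILEX=\"0\"
VARS g3 TILEY=\"1\"
VARS g4 CENTERX=\"0.75\"
VARS g4 CENTERY=\"-0.75\"
VARS g4 TILEX=\"1\"
VARS g4 TILEY=\"1\"
PARENT g1 g2 g3 g4 CHILD montage

The montage node keeps its own montage.sub and needs no VARS lines.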
What happens?","title":"Declare your variables"},{"location":"ASP2024/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"ASP2024/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"ASP2024/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"ASP2024/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"ASP2024/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. 
For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . 
The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build command. We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with a tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base your image on images not already published by OSG, but if you do this, we recommend that, as one of the steps, you create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image definitions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. Distributing Custom Images Via CVMFS In order to efficiently distribute the container images to a large number of distributed compute hosts, OSG has chosen to host the images under CVMFS . Any image publicly available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ . To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Page sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"ASP2024/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"ASP2024/14-Containers/#objective","text":"Singularity is a container system that allows users full control over their environment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction to how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"ASP2024/14-Containers/#default-image","text":"The default setup is to auto-load an image on sites which support Singularity. Every job that lands on such a site will have a container started just for that job, and will then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites.
The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"ASP2024/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"ASP2024/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"ASP2024/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . 
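To give a feel for what such a Dockerfile contains, here is a minimal sketch; the FROM line and the package list are illustrative assumptions rather than the actual contents of the OSG-provided Dockerfile, so treat the downloaded file as your real starting point:

# Illustrative base image; the OSG Dockerfile has its own FROM line
FROM ubuntu:16.04

# Install whatever your job needs (example package only)
RUN apt-get update && apt-get install -y --no-install-recommends wget && rm -rf /var/lib/apt/lists/*

# Required so tools and data on /cvmfs can be made visible inside the container
RUN mkdir -p /cvmfs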
Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"ASP2024/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"ASP2024/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"ASP2024/ASP2024_Materials/","text":"ASP 2024 Materials We will be using OSG Training Pool for this set of sessions. Please visit https://notebook.ospool.osg-htc.org/hub/login to log in. Wednesday Afternoon - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday Afternoon - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Morning - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Morning - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Thursday Afternoon - Computational Infrastructures - Session 5 ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"ASP 2024 Materials"},{"location":"ASP2024/ASP2024_Materials/#asp-2024-materials","text":"We will be using OSG Training Pool for this set of sessions. 
Please visit https://notebook.ospool.osg-htc.org/hub/login to log in.","title":"ASP 2024 Materials"},{"location":"ASP2024/ASP2024_Materials/#wednesday-afternoon-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Intro-Slides Lecture1-Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday Afternoon - Computational Infrastructures - Session 1"},{"location":"ASP2024/ASP2024_Materials/#wednesday-afternoon-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday Afternoon - Computational Infrastructures - Session 2"},{"location":"ASP2024/ASP2024_Materials/#thursday-morning-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Morning - Computational Infrastructures - Session 3"},{"location":"ASP2024/ASP2024_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"ASP2024/ASP2024_Materials/#thursday-morning-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Morning - Computational Infrastructures - Session 4"},{"location":"ASP2024/ASP2024_Materials/#thursday-afternoon-computational-infrastructures-session-5","text":"ATLAS Analysis Example Lecture ATLAS Analysis Example Exercise Computational Infrastructures Wrap Up - Slides","title":"Thursday Afternoon - Computational Infrastructures - Session 5"},{"location":"ASP2024/ASP2024_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu Horst Severini - severini@ou.edu Patrick Skubic - pskubic@ou.edu Julia Gray - julia.ann.gray@gmail.com Jae Yu - jaehoonyu@uta.edu Chris Walker - walker@nhn.ou.edu If you want long\u2010term grid access, you can go to http://www.osgconnect.net/ and sign up","title":"Contact information"},{"location":"ASP2024/ASP2024_Schedule/","text":"ASP 2024 High-Level Curriculum Overview Wednesday PM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday AM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers Thursday PM Session ATLAS Analysis Example Close out and resources for further collaboration Detailed Schedule Wednesday 17-July 2024 Time Description Instructor 14:30 Welcome and the Landscape of Research Computing Jae Yu 15:15 Exercise - UNIX Refresher, Running simple Condor jobs All 16:00 Coffee Break 16:30 Profiling your application and finding a home for your workflow Pat Skubic 17:15 Exercise - Running scripting and R jobs with Condor All Thursday 18-July 2024 Time Description Instructor 09:30 Workflows and distributed environments Horst Severini 10:15 Exercise - DAGMan All 11:00 Coffee Break 11:30 A Brief Introduction to Clouds and Containers Horst Severini 12:15 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 13:00 Lunch 14:30 ATLAS Analysis Example Pat Skubic 14:30 ATLAS Analysis Exercises All 16:00 Coffee Break 16:30 More Exercise Time All 17:30 Closing Thoughts All Materials Materials Page","title":"ASP 2024 
Schedule"},{"location":"ASP2024/ASP2024_Schedule/#asp-2024","text":"","title":"ASP 2024"},{"location":"ASP2024/ASP2024_Schedule/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"ASP2024/ASP2024_Schedule/#wednesday-pm-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday PM Session"},{"location":"ASP2024/ASP2024_Schedule/#thursday-am-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers","title":"Thursday AM Session"},{"location":"ASP2024/ASP2024_Schedule/#thursday-pm-session","text":"ATLAS Analysis Example Close out and resources for further collaboration","title":"Thursday PM Session"},{"location":"ASP2024/ASP2024_Schedule/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2024/ASP2024_Schedule/#wednesday-17-july-2024","text":"Time Description Instructor 14:30 Welcome and the Landscape of Research Computing Jae Yu 15:15 Exercise - UNIX Refresher, Running simple Condor jobs All 16:00 Coffee Break 16:30 Profiling your application and finding a home for your workflow Pat Skubic 17:15 Exercise - Running scripting and R jobs with Condor All","title":"Wednesday 17-July 2024"},{"location":"ASP2024/ASP2024_Schedule/#thursday-18-july-2024","text":"Time Description Instructor 09:30 Workflows and distributed environments Horst Severini 10:15 Exercise - DAGMan All 11:00 Coffee Break 11:30 A Brief Introduction to Clouds and Containers Horst Severini 12:15 Exercise - Using Containers on OSG and Discpline Specfic Tutorials All 13:00 Lunch 14:30 ATLAS Analysis Example Pat Skubic 14:30 ATLAS Analysis Exercises All 16:00 Coffee Break 16:30 More Exercise Time All 17:30 Closing Thoughts All","title":"Thursday 18-July 2024"},{"location":"ASP2024/ASP2024_Schedule/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2024/AnalysisExample/","text":"ATLAS Analysis Example Introduction Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker. Prerequisite Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command. Simple Analysis Example Step 1: Create simulated data using the grid Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. 
----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q . Step 2: Analyze Real Data We will not submit grid jobs during this exercise. So we will skip submit script. 
----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new 
TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); 
zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. ----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them. Step 3: Make TSelector Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
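To make these additions more concrete, here is a sketch of what the edited files could look like. Note that the include line above appears to have lost its argument in formatting; since the new member is a TH1F, the header you most likely need is TH1F.h (this is an assumption, not something the MakeSelector output contains). In s0.h , near the existing includes: #include \"TH1F.h\" and inside the class definition: class s0 : public TSelector { public : TH1F *e; ... }; In s0.C : void s0::SlaveBegin(TTree * /*tree*/) { ... e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); } Bool_t s0::Process(Long64_t entry) { GetEntry(entry); e->Fill(Energy); return kTRUE; } void s0::Terminate() { TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); } The lines marked ... stand for the code that MakeSelector already generated and that you leave unchanged.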
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"ATLAS Analysis Example"},{"location":"ASP2024/AnalysisExample/#atlas-analysis-example","text":"","title":"ATLAS Analysis Example"},{"location":"ASP2024/AnalysisExample/#introduction","text":"Root may be run in batch mode on the grid to analyze large data samples. This example creates simulated data in root format using trees and performs analysis on the simulated data by means of processing on the grid. This example is based on a demo developed by OU programmer Chris Walker.","title":"Introduction"},{"location":"ASP2024/AnalysisExample/#prerequisite","text":"Open a new Terminal on your local desktop. NOTE: You are no longer using the browser based terminal now, but the Terminal on your CentOS VM, just like you did to display mandle.gif with firefox. Make a directory for this exercise $ mkdir -p analysis_example $ cd analysis_example Again the $ sign at the beginning of the commands to execute is the command prompt , so it should not be entered as part of the command.","title":"Prerequisite"},{"location":"ASP2024/AnalysisExample/#simple-analysis-example","text":"","title":"Simple Analysis Example"},{"location":"ASP2024/AnalysisExample/#step-1-create-simulated-data-using-the-grid","text":"Note: Since the new training VMs on OSpool do not support running root, we will run root on the local desktops instead of using condor. So we will not need the condor submit scripts below but we will leave the instructions for them for future reference. Now in your test directory we will create the three files: run-root.cmd , run-root.sh , and run-root.C with the contents given below. 
This may require running an editor such as emacs or nano on your local desktop. We will not submit grid jobs so the \"run-root.cmd\" script is not needed for this exercise. ----------------------------- Skip from here-1 ----------------------------------------- First, we will utilize a simple command script to submit the grid jobs. It is run-root.cmd : universe=vanilla executable=run-root.sh transfer_input_files = run-root.C transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root.log transfer_output_files = root.out,t00.root,t01.root output=run-root.out.$(Cluster).$(Process) error=run-root.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-1 ---------------------------------------------------- Note that the executable script is: run-root.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b < run-root.C > root.out This script runs Root in batch mode and executes input macro run-root.C and produces output that is routed to file root.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-root.sh The macro run-root.C consists of the following code: { // create files containing simulated data TRandom g; char c[256]; for ( int j = 0 ; j < 2 ; j++ ){ sprintf(c,\"t%2.2d.root\\000\",j); TFile f(c,\"RECREATE\",\"MyFile\", 0/*no compression*/); TTree *t = new TTree(\"t0\",\"t0\"); Int_t Run; TBranch * b_Run = t->Branch(\"Run\",&Run); Int_t Event; TBranch * b_Event = t->Branch(\"Event\",&Event); Float_t Energy; TBranch * b_Energy = t->Branch(\"Energy\",&Energy); Run = j; for( Event = 0 ; Event < 100 ; Event++ ){ Energy = g.Gaus(500.0 , 200.0); t->Fill(); } f.Write(); f.Close(); } } .q We will not submit grid jobs during this exercise. So we will skip to running root. ----------------------------- Skip from here-2 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-root.cmd It can be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-root.log , and output file: root.out , and the files containing the simulated data: t00.root , t01.root in your test directory. You need to copy these files into your public directory, so that you can download it to your local desktop: $ cp t0*.root ~/public/ Now open a different terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/t00.root http://stash.osgconnect.net/~YOUR_USER_ID/t01.root ----------------------------------------------- Skip to here-2 ---------------------------------------------------- Execute the script to run root: ./run-root.sh You can then inspect the contents of t00.root and t01.root by running root in your current directory in the local terminal window: $ root t00.root And then the Root command: TBrowser b With the TBrowser you can plot the simulated data in branch Energy as well as the other branches. Double click on the name of the root files, and then on the variables you would like to plot. Each data file contains a TTree named t0 . 
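Optionally, before plotting, you can take a quick text-only look at one of the trees. This is just a sketch; it relies on root's usual interactive behaviour of providing a pointer named after the TTree ( t0 ) once the file is open: $ root t00.root and then at the root prompt: t0->Print(); t0->Scan(\"Run:Event:Energy\",\"\",\"\",5); .q Print lists the branches (Run, Event and Energy) and Scan dumps the first five entries, so you can confirm that the simulated data look sensible.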
You can plot the contents of all (in this example both) data file TTree's by using the TChain method as follows: In root execute the following commands: TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Draw(\"Energy\"); When you are done with this, you can quit root again with the command .q .","title":"Step 1: Create simulated data using the grid"},{"location":"ASP2024/AnalysisExample/#step-2-analyze-real-data","text":"We will not submit grid jobs during this exercise. So we will skip submit script. ----------------------------- Skip from here-3 ----------------------------------------- The grid job can be submitted using: Now we want to have a look at a real live ATLAS root file. For this, go back to the remote terminal window on osgconnect. You will need a new condor submit script called run-z.cmd : universe=vanilla executable=run-z.sh transfer_input_files = readEvents.C,muons.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-z.log transfer_output_files = root-z.out,histograms-z.root output=run-z.out.$(Cluster).$(Process) error=run-z.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-3 ---------------------------------------------------- The new executable script you need for this job is: run-z.sh which is as follows: #!/bin/bash # setup source /cvmfs/sft.cern.ch/lcg/views/setupViews.sh LCG_105a x86_64-ubuntu2204-gcc11-opt # execute root -b -q readEvents.C+ > root-z.out This script runs root in batch mode and executes input macro readEvents.C and produces output that is routed to file root-z.out . It has to be made executable, by use of the chmod Linux command (protections can be checked with the command ls -l ): $ chmod +x run-z.sh The macro readEvents.C consists of the following code: #include \"TFile.h\" #include \"TTree.h\" #include \"TCanvas.h\" #include \"TH1F.h\" #include \"iostream\" //#include \"TLorentzVector.h\" using namespace std; void readEvents(){ // load the ROOT ntuple file TFile * f = new TFile(\"muons.root\"); TTree *tree = (TTree *) f->Get(\"POOLCollectionTree\"); int nEntries = tree->GetEntries(); cout << \"There are \" << nEntries << \" entries in your ntuple\" << endl; // create local variables for the tree's branches UInt_t NLooseMuons; Float_t LooseMuonsEta1; Float_t LooseMuonsPhi1; Float_t LooseMuonsPt1; Float_t LooseMuonsEta2; Float_t LooseMuonsPhi2; Float_t LooseMuonsPt2; // set the tree's branches to the local variables tree->SetBranchAddress(\"NLooseMuon\", &NLooseMuons); tree->SetBranchAddress(\"LooseMuonEta1\", &LooseMuonsEta1); tree->SetBranchAddress(\"LooseMuonPhi1\", &LooseMuonsPhi1); tree->SetBranchAddress(\"LooseMuonPt1\", &LooseMuonsPt1); tree->SetBranchAddress(\"LooseMuonEta2\", &LooseMuonsEta2); tree->SetBranchAddress(\"LooseMuonPhi2\", &LooseMuonsPhi2); tree->SetBranchAddress(\"LooseMuonPt2\", &LooseMuonsPt2); // declare some histograms TH1F *muPt1 = new TH1F(\"muPt1\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx1 = new TH1F(\"muPx1\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy1 = new TH1F(\"muPy1\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz1 = new TH1F(\"muPz1\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta1 = new TH1F(\"muEta1\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi1 = new TH1F(\"muPhi1\", \";#phi;Events\", 50, -4, 4); TH1F *muE1 = new TH1F(\"muE1\", \";Energy;Events\", 50, 0, 200); TH1F *muPt2 = new TH1F(\"muPt2\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *muPx2 = new TH1F(\"muPx2\", \";p_{x} 
[GeV/c];Events\", 50, 0, 200); //added px TH1F *muPy2 = new TH1F(\"muPy2\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *muPz2 = new TH1F(\"muPz2\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz TH1F *muEta2 = new TH1F(\"muEta2\", \";#eta;Events\", 50, -3, 3); TH1F *muPhi2 = new TH1F(\"muPhi2\", \";#phi;Events\", 50, -4, 4); TH1F *muE2 = new TH1F(\"muE2\", \";Energy;Events\", 50, 0, 200); TH1F *zPt = new TH1F(\"zPt\", \";p_{T} [GeV/c];Events\", 50, 0, 200); TH1F *zPx = new TH1F(\"zPx\", \";p_{x} [GeV/c];Events\", 50, 0, 200); //added px TH1F *zPy = new TH1F(\"zPy\", \";p_{y} [GeV/c];Events\", 50, 0, 200); //added py TH1F *zPz = new TH1F(\"zPz\", \";p_{z} [GeV/c];Events\", 50, 0, 200); //added pz //TH1F *zEta = new TH1F(\"zEta\", \";#eta;Events\", 50, -3, 3); //TH1F *zPhi = new TH1F(\"zPhi\", \";#phi;Events\", 50, -4, 4); TH1F *zE = new TH1F(\"zE\", \";Energy;Events\", 50, 0, 200); TH1F *zMass = new TH1F(\"zMass\", \";Mass;Events\", 50, 0, 200); // loop over each entry (event) in the tree for( int entry=0; entry < nEntries; entry++ ){ if( entry%10000 * 0 ) cout << \"Entry:\" << entry << endl; // check that the event is read properly int entryCheck = tree->GetEntry( entry ); if( entryCheck <= 0 ){ continue; } // only look at events containing at least 2 leptons if(NLooseMuons < 2) continue; // require the leptons to have some transverse momentum if(abs(LooseMuonsPt1) *0.001 < 20 || abs(LooseMuonsPt2) *0.001 < 20 ) continue; // make a LorentzVector from the muon //TLorentzVector Muons1; // Muons1.SetPtEtaPhiM(fabs(LooseMuonsPt1), LooseMuonsEta1, LooseMuonsPhi1, 0); // print out the details of an electron every so often if( entry%10000 * 0 ){ cout << \"Muons pt1: \" << LooseMuonsPt1 << \" eta: \" << LooseMuonsEta1 << \" phi \" << LooseMuonsPhi1 << endl; cout << \"Muons pt2: \" << LooseMuonsPt2 << \" eta: \" << LooseMuonsEta2 << \" phi \" << LooseMuonsPhi2 << endl; } //calculation of muon energy Double_t muonMass = 0.0; // assume the mass of the muon is negligible Double_t muonPx1 = abs(LooseMuonsPt1)*cos(LooseMuonsPhi1); Double_t muonPy1 = abs(LooseMuonsPt1)*sin(LooseMuonsPhi1); Double_t muonPz1 = abs(LooseMuonsPt1)*sinh(LooseMuonsEta1); Double_t muonEnergy1 = sqrt (muonPx1*muonPx1 + muonPy1*muonPy1 + muonPz1*muonPz1 + muonMass*muonMass); Double_t muonPx2 = abs(LooseMuonsPt2)*cos(LooseMuonsPhi2); Double_t muonPy2 = abs(LooseMuonsPt2)*sin(LooseMuonsPhi2); Double_t muonPz2 = abs(LooseMuonsPt2)*sinh(LooseMuonsEta2); Double_t muonEnergy2 = sqrt (muonPx2*muonPx2 + muonPy2*muonPy2 + muonPz2*muonPz2 + muonMass*muonMass); Double_t zCompX = muonPx1 + muonPx2; Double_t zCompY = muonPy1 + muonPy2; Double_t zLongi = muonPz1 + muonPz2; Double_t zPerp = sqrt (zCompX*zCompX + zCompY*zCompY); Double_t zEnergy = muonEnergy1 + muonEnergy2; Double_t zM = sqrt (zEnergy*zEnergy -zCompX*zCompX -zCompY*zCompY -zLongi*zLongi); // fill our histograms muPt1->Fill((LooseMuonsPt1)*0.001); // in GeV muEta1->Fill(LooseMuonsEta1); muPhi1->Fill(LooseMuonsPhi1); muPx1->Fill( muonPx1*0.001); // in GeV muPy1->Fill( muonPy1*0.001); // in GeV muPz1->Fill( muonPz1*0.001); // in GeV muE1->Fill(muonEnergy1*0.001); // in GeV muPt2->Fill((LooseMuonsPt2)*0.001); // in GeV muEta2->Fill(LooseMuonsEta2); muPhi2->Fill(LooseMuonsPhi2); muPx2->Fill( muonPx2*0.001); // in GeV muPy2->Fill( muonPy2*0.001); // in GeV muPz2->Fill( muonPz2*0.001); // in GeV muE2->Fill(muonEnergy2*0.001); // in GeV zPt->Fill( zPerp*0.001); // in GeV zPx->Fill( zCompX*0.001); // in GeV zPy->Fill( zCompY*0.001); // in GeV zPz->Fill( zLongi*0.001); // 
in GeV zE->Fill( zEnergy*0.001); // in GeV zMass->Fill(zM*0.001); // in GeV } // draw the eta distribution zMass->Draw(); // make a ROOT output file to store your histograms TFile *outFile = new TFile(\"histograms-z.root\", \"recreate\"); muPt1->Write(); muEta1->Write(); muPhi1->Write(); muE1->Write(); muPx1->Write(); muPy1->Write(); muPz1->Write(); muPt2->Write(); muEta2->Write(); muPhi2->Write(); muE2->Write(); muPx2->Write(); muPy2->Write(); muPz2->Write(); zPt->Write(); zE->Write(); zPx->Write(); zPy->Write(); zPz->Write(); zMass->Write(); outFile->Close(); } We will not submit grid jobs during this exercise. So we will skip to run root. ----------------------------- Skip from here-4 ----------------------------------------- The grid job can be submitted using: $ condor_submit run-z.cmd It can again be checked with: $ condor_q YOUR_USER_ID -nobatch After it runs, you will find a log file that describes the job: run-z.log , and output file: root-z.out , and the files containing the simulated data: histograms-z.root in your test directory. You again need to copy that file into your public directory, so that you can download it to your local desktop: $ cp histograms-z.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: $ wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms-z.root ----------------------------------------------- Skip to here-4 ---------------------------------------------------- Setup a soft link to the input data file, muons.root, and execute the script to run root: ln -s /opt/data/muons.root . ./run-z.sh You can inspect the contents of histograms-z.root by running Root (i.e., root histograms-z.root ) in your current directory in your local terminal window: $ root histograms-z.root And then using the Root command: TBrowser b With the TBrowser you can plot the variables in the root file. Double click on histograms-z.root , and then on the variables to plot them.","title":"Step 2: Analyze Real Data"},{"location":"ASP2024/AnalysisExample/#step-3-make-tselector","text":"Now let's go back to the files created in step 1, in the local terminal window. Start root in your test directory with the following commands: $ root -b And then execute the following commands: TFile f(\"t00.root\"); t0->MakeSelector(\"s0\",\"=legacy\"); f.Close(); .q This will create files s0.C and s0.h in your test directory that contain code corresponding to the definition of the TTree t0 . This code can be used to process files containing data in these TTree's. Now we will add a histogram to the TSelector code. Several code lines have to be added to the TSelector code files s0.C and s0.h . To s0.h make the following additions: after existing include statements add: #include After class s0 definition: class s0 : public TSelector { public : add TH1F *e; To s0.C make the following additions: After entry: void s0::SlaveBegin(TTree * /*tree*/) { add e = new TH1F(\"e\", \"e\", 1000, -199.0, 1200.0); After Process entry: Bool_t s0::Process(Long64_t entry) { add GetEntry(entry); e->Fill(Energy); After terminate entry: void s0::Terminate() { add TFile f(\"histograms.root\",\"RECREATE\"); f.WriteObject(e,\"Energy\"); f.Close(); We will not submit grid jobs during this exercise. So we will skip submit script. 
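Once the selector has been run (the run-root-2.sh script created below takes care of that) and histograms.root exists, you can also check the stored histogram without opening a TBrowser. A minimal sketch, assuming the object was written under the name Energy as above: $ root histograms.root and then: TH1F *h = nullptr; gFile->GetObject(\"Energy\", h); if (h) h->Draw(); .q This is only an alternative to the TBrowser inspection described later.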
----------------------------- Skip from here-5 ----------------------------------------- Now create the new script files for Step 2: create run-root-2.cmd : universe=vanilla executable=run-root-2.sh transfer_input_files = s0.C,s0.h,run-root-2.C,t00.root,t01.root transfer_executable=True when_to_transfer_output = ON_EXIT log=run-root-2.log transfer_output_files = root-2.out,histograms.root output=run-root-2.out.$(Cluster).$(Process) error=run-root-2.err.$(Cluster).$(Process) notification=Never queue ----------------------------------------------- Skip to here-5 ---------------------------------------------------- Create run-root-2.sh : #!/bin/bash root -b < run-root-2.C > root-2.out It has to be made executable, by use of the chmod Linux command: chmod +x run-root-2.sh Create run-root-2.C .L s0.C++ { //Load and run TSelector s0 *s = new s0(); TChain tc(\"t0\"); tc.Add(\"t*.root\"); tc.Process(s); } We can test the root job on the local machine by executing the script to run root: ./run-root-2.sh We will not submit grid jobs during this exercise. So we will skip running condor. ----------------------------- Skip from here-6 ----------------------------------------- If this works, we can process the data files t00.root and t01.root on the Grid with our new command script run-root-2.cmd . This can be done with command: condor_submit run-root-2.cmd Once your job has finished, you again need to copy that file into your public directory, so that you can download it to your local desktop: cp histograms.root ~/public/ Go back to the local terminal window on your local desktop, and download the root files with: wget http://stash.osgconnect.net/~YOUR_USER_ID/histograms.root ----------------------------------------------- Skip to here-6 ---------------------------------------------------- You can look at the output histogram file: histograms.root with TBrowser b as before, in your local terminal window.","title":"Step 3: Make TSelector"},{"location":"ASP2024/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon 2017"},{"location":"ASP2024/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"ASP2024/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2024/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"ASP2024/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"ASP2024/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"ASP2024/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"ASP2024/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. 
Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon School Materials"},{"location":"ASP2024/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"ASP2024/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"ASP2024/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"ASP2024/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"ASP2024/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"ASP2024/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"ASP2024/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"ASP2024/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"ASP2024/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"ASP2024/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"ASP2024/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"ASP2024/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"ASP2024/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"ASP2024/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"ASP2024/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"ASP2024/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources 
Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"ASP2024/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataKigali2018/","text":"Placeholder.","title":"Index"},{"location":"DataKigali2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"DataKigali2018/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"DataKigali2018/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"DataKigali2018/01-Introduction/#which-condor","text":"We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataKigali2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataKigali2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataKigali2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataKigali2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataKigali2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataKigali2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataKigali2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataKigali2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? 
Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataKigali2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataKigali2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataKigali2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? 
This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataKigali2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataKigali2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataKigali2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. 
In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . 
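For example, the output-related lines might end up looking something like this sketch (everything else in the submit file stays the same): Log = simple.$(Cluster).$(Process).log Output = simple.$(Cluster).$(Process).out Error = simple.$(Cluster).$(Process).error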
If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataKigali2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataKigali2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataKigali2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataKigali2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataKigali2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataKigali2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataKigali2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataKigali2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataKigali2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataKigali2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataKigali2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataKigali2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataKigali2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataKigali2018/04-TipsandTricks/#tips-for-condor_q","text":"condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. 
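If you would rather not page through the whole ClassAd, one option (just a sketch, using attributes that appear in the examples that follow) is to grep for several attributes at once: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep -E 'RequestCpus|RequestMemory|UserLog'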
How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataKigali2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataKigali2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataKigali2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataKigali2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataKigali2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . 
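If you get stuck, a minimal submit file might look something like the sketch below; it simply reuses the settings from the earlier submit file for the C program, with the shell script as the Executable: Universe = vanilla Executable = simple.sh Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = submit.log Output = submit.out Error = submit.err requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue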
You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataKigali2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataKigali2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. 
After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataKigali2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataKigali2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataKigali2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) 
This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataKigali2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataKigali2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataKigali2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. 
It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataKigali2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataKigali2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataKigali2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. 
We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataKigali2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataKigali2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataKigali2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. 
#!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataKigali2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataKigali2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. 
The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataKigali2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataKigali2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataKigali2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. 
The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataKigali2018/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataKigali2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataKigali2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataKigali2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataKigali2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataKigali2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . 
job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataKigali2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataKigali2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
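Since the four goatbrot submit files below differ only in the tile centre and in the row/column used in their output names, one way to avoid copy-and-paste mistakes is to generate them from a short script. The sketch that follows is only an illustration, not part of the exercise: the script name make_goatbrot_subs.sh is made up, and the requirements line is left out to keep it short, so you would add the same one used in the other submit files of this lesson. The hand-written files shown next work just as well.

#!/bin/sh
# Sketch: write goatbrot1.sub ... goatbrot4.sub for the 2x2 tiling used below.
exe=/stash/user/rquick/public/goatbrot-master/goatbrot
row=0
for cy in 0.75 -0.75; do
  col=0
  for cx in -0.75 0.75; do
    n=$((row * 2 + col + 1))
    cat > goatbrot$n.sub <<EOF
executable = $exe
arguments = -i 100000 -c $cx,$cy -w 1.5 -s 500,500 -o tile_${row}_${col}.ppm
log = goatbrot.log
output = goatbrot.out.$row.$col
error = goatbrot.err.$row.$col
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
    col=$((col + 1))
  done
  row=$((row + 1))
done

Run it with sh make_goatbrot_subs.sh and compare what it writes with the files listed below.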
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
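If you want the answer to that question spelled out, here is the same DAG again with comments added (a line beginning with # is a comment in a DAG input file). This is only a restatement of the file above; nothing about the exercise changes:

# goatbrot.dag, annotated
# The four tile nodes have no PARENT line pointing at them, so DAGMan may
# submit them all straight away and they can run at the same time.
JOB g1 goatbrot1.sub
JOB g2 goatbrot2.sub
JOB g3 goatbrot3.sub
JOB g4 goatbrot4.sub
JOB montage montage.sub
# This one line is what enforces the ordering: montage only becomes ready
# after g1, g2, g3 and g4 have all completed successfully.
PARENT g1 g2 g3 g4 CHILD montage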
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataKigali2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataKigali2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataKigali2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
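If you take on the bigger-DAG challenge mentioned in the On your own section above, writing the submit files and the DAG by hand quickly gets tedious. The sketch below is one possible starting point, not part of the exercise: the script name make_tiles.sh, the 4x4 grid, the tile_R_C.sub file names and the montage4x4.sub node are all assumptions, and you would still need to write that montage step yourself (16 input files, -tile 4x4) and add the requirements line used elsewhere in this lesson.

#!/bin/sh
# Sketch: 4x4 tiling of the same region as the 2x2 example (total width 3,
# so each tile is 0.75 wide). Writes 16 submit files plus bigger.dag.
dag=bigger.dag
echo JOB montage montage4x4.sub > $dag
row=0
for cy in 1.125 0.375 -0.375 -1.125; do
  col=0
  for cx in -1.125 -0.375 0.375 1.125; do
    sub=tile_${row}_${col}.sub
    cat > $sub <<EOF
executable = /stash/user/rquick/public/goatbrot-master/goatbrot
arguments = -i 100000 -c $cx,$cy -w 0.75 -s 500,500 -o tile_${row}_${col}.ppm
log = goatbrot.log
output = goatbrot.out.$row.$col
error = goatbrot.err.$row.$col
should_transfer_files = YES
when_to_transfer_output = ONEXIT
queue
EOF
    echo JOB t${row}_${col} $sub >> $dag
    echo PARENT t${row}_${col} CHILD montage >> $dag
    col=$((col + 1))
  done
  row=$((row + 1))
done

Declaring the montage node first and adding one PARENT ... CHILD line per tile keeps the script simple; the combined single-line form used in the 2x2 example works equally well.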
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataKigali2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataKigali2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataKigali2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataKigali2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataKigali2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataKigali2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"DataKigali2018/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"DataKigali2018/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataKigali2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataKigali2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
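If you want to double-check which node failed without reading the whole rescue DAG, the failure summary in its comment block can be pulled out directly (a minimal sketch, assuming GNU grep; -A1 also prints the line that follows the match, which names the failed node):

$ grep -A1 'Nodes that failed' goatbrot.dag.rescue001
# Nodes that failed: 1
#   montage,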
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataKigali2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataKigali2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataKigali2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataKigali2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataKigali2018/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataKigali2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataKigali2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataKigali2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataKigali2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataKigali2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
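Putting the steps above together, a custom image definition can be quite small. The sketch below is only an illustration: the FROM line assumes the OSG Ubuntu Xenial base is published on Docker Hub as opensciencegrid/osgvo-ubuntu-xenial, and the installed package is just a placeholder for whatever your job actually needs.

# Dockerfile for a hypothetical namespace/repository_name image
FROM opensciencegrid/osgvo-ubuntu-xenial:latest
# example dependency only -- replace with the software your job requires
RUN apt-get update && apt-get install -y python3 && rm -rf /var/lib/apt/lists/*
# required directories (so tools and data on /cvmfs are visible inside the container)
RUN mkdir -p /cvmfs

Build and push it as described above:

$ docker build -t namespace/repository_name .
$ docker push namespace/repository_name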
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataKigali2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataKigali2018/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataKigali2018/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. 
The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataKigali2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"DataKigali2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataKigali2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataKigali2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . 
Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataKigali2018/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataKigali2018/Materials/","text":"Data Kigali School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Friday Morning - Computational Infrastructures - Session 5 Lecture 5 - Persistent Identifiers, Digital Objects, an Architecture for a Data Centeric Internet Friday Morning - Computational Infrastructures - Session 6 Getting Involved with CODATA, RDA, and the Foundational Schools of Research Data Science Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Data Kigali 2018 Materials"},{"location":"DataKigali2018/Materials/#data-kigali-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Kigali School Materials"},{"location":"DataKigali2018/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataKigali2018/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataKigali2018/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataKigali2018/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataKigali2018/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataKigali2018/Materials/#friday-morning-computational-infrastructures-session-5","text":"Lecture 5 - Persistent Identifiers, Digital Objects, an Architecture for a Data Centeric Internet","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataKigali2018/Materials/#friday-morning-computational-infrastructures-session-6","text":"Getting Involved with CODATA, RDA, and the Foundational Schools of Research Data Science","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataKigali2018/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"DataKigali2018/School/","text":"Data Kigali 2018 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials Detailed Schedule Thursday 16-August-2018 Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick Friday 17-August-2018 Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob Quick 11:00 Coffee Break 11:30 Becoming Involved and Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Kigali 2018 Schedule"},{"location":"DataKigali2018/School/#data-kigali-2018","text":"","title":"Data Kigali 2018"},{"location":"DataKigali2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"DataKigali2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataKigali2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"DataKigali2018/School/#friday-am-session","text":"A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials","title":"Friday AM Session"},{"location":"DataKigali2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataKigali2018/School/#thursday-16-august-2018","text":"Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick","title":"Thursday 16-August-2018"},{"location":"DataKigali2018/School/#friday-17-august-2018","text":"Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob 
Quick 11:00 Coffee Break 11:30 Becoming Involved and Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 17-August-2018"},{"location":"DataKigali2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataSaoPaulo2018/","text":"Placeholder.","title":"Index"},{"location":"DataSaoPaulo2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"DataSaoPaulo2018/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"DataSaoPaulo2018/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"DataSaoPaulo2018/01-Introduction/#which-condor","text":"We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataSaoPaulo2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataSaoPaulo2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataSaoPaulo2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataSaoPaulo2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataSaoPaulo2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataSaoPaulo2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataSaoPaulo2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataSaoPaulo2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? 
Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataSaoPaulo2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataSaoPaulo2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataSaoPaulo2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? 
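As a hint for these bonus questions, the invocations below show roughly what the options do (JobStatus and the other names are standard job ClassAd attributes; substitute whatever you are curious about, and use your own job ID in place of YOUR_JOB_CLUSTER_NUMBER):
$ condor_q -constraint 'JobStatus == 2'                      # only show jobs that are currently running (status 2)
$ condor_q -format \"%d.\" ClusterId -format \"%d\\n\" ProcId      # print just the job IDs, one per line
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER                        # dump the full ClassAd for one job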
This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataSaoPaulo2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataSaoPaulo2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. 
In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. 
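For example (this is one reasonable naming scheme, not the only correct answer), the file-naming lines in the submit file could become:
Log = simple.$(Cluster).$(Process).log
Output = simple.$(Cluster).$(Process).out
Error = simple.$(Cluster).$(Process).error
With names like these, a second condor_submit gets a new cluster number, so its output no longer overwrites the files from the first run.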
Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataSaoPaulo2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataSaoPaulo2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#tips-for-condor_q","text":"condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. 
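If you would rather pull out just a few attributes than read the whole ClassAd, and your condor_q is new enough to support it, the -autoformat (or -af) option prints only the attributes you name:
$ condor_q -af ClusterId ProcId JobStatus RequestMemory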
How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataSaoPaulo2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataSaoPaulo2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataSaoPaulo2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataSaoPaulo2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . 
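If you get stuck, a submit file along these lines (modelled on the earlier C example; the argument values and file names are just suggestions) should do the trick:
Universe = vanilla
Executable = simple.sh
Arguments = 4 10
+ProjectName = \"ConnectTrain\"
Log = simple.sh.log
Output = simple.sh.out
Error = simple.sh.error
requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\")
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
Queue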
You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataSaoPaulo2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataSaoPaulo2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. 
After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataSaoPaulo2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataSaoPaulo2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataSaoPaulo2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) 
This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataSaoPaulo2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataSaoPaulo2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataSaoPaulo2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. 
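(A quick sanity check, if you like: after module load R, the commands below should show an R coming from the OASIS area rather than a local install; the exact path and version you see will depend on the module.)
$ which R
$ R --version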
It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataSaoPaulo2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataSaoPaulo2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. 
We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. 
#!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataSaoPaulo2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataSaoPaulo2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. 
The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. 
The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataSaoPaulo2018/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataSaoPaulo2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. 
Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataSaoPaulo2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataSaoPaulo2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
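Where do those slightly different parameters come from? The single-image run earlier used -c 0,0 -w 3, that is, a square of width 3 centered on the origin. Cutting that square into a 2x2 grid gives four tiles, each of width 3/2 = 1.5, with centers a quarter of the full width (3/4 = 0.75) away from the origin in x and y. Hence -w 1.5 and the four centers -0.75,0.75 and 0.75,0.75 and -0.75,-0.75 and 0.75,-0.75 in the submit files below, along with matching tile_row_column output names.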
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
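For reference (the node names and submit file names here are made up for illustration and are not part of this exercise), the same JOB and PARENT ... CHILD syntax scales to deeper workflows. The prepare, sweep, collate pattern mentioned in the DAGMan introduction could be sketched as: JOB prep prep.sub JOB sweep1 sweep1.sub JOB sweep2 sweep2.sub JOB collate collate.sub PARENT prep CHILD sweep1 sweep2 PARENT sweep1 sweep2 CHILD collate In this sketch prep must finish before either sweep node starts, the two sweeps may run at the same time, and collate waits for both of them, which is the same mechanism that holds montage back until all four goatbrot tiles are done.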
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"DataSaoPaulo2018/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"DataSaoPaulo2018/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataSaoPaulo2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataSaoPaulo2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
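(A quick diagnostic sketch, not part of the original text: if you would rather not read the whole DAGMan log, searching it for the failure line works too. The timestamps and cluster IDs will differ on your run, but the match should be the same line quoted in the log excerpt above.)

$ grep 'failed with status' goatbrot.dag.dagman.out
06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1.

This, together with the comment at the top of goatbrot.dag.rescue001, tells you exactly which node to repair before resubmitting.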
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataSaoPaulo2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataSaoPaulo2018/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataSaoPaulo2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataSaoPaulo2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
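As a concrete illustration of the steps above, here is a minimal Dockerfile sketch. It is not an official OSG recipe: the base image name is assumed from the osgvo-ubuntu-xenial CVMFS path listed earlier, and the python3 package is only a stand-in for whatever software your job actually needs.

# Base the custom image on the OSG-provided Ubuntu Xenial image.
FROM opensciencegrid/osgvo-ubuntu-xenial:latest

# Install your application's dependencies (python3 is just an example).
RUN apt-get update && \
    apt-get install -y python3 && \
    rm -rf /var/lib/apt/lists/*

# Required directory so /cvmfs can be bind-mounted in the container
# (harmless if the base image already provides it).
RUN mkdir -p /cvmfs

Build and publish it under your own Docker Hub repository exactly as described above ( docker build -t namespace/repository_name . then docker push namespace/repository_name ), and then register it for CVMFS distribution as explained in the next section.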
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataSaoPaulo2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataSaoPaulo2018/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataSaoPaulo2018/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. 
The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataSaoPaulo2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"DataSaoPaulo2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataSaoPaulo2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataSaoPaulo2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . 
Any image publicly available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ . To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataSaoPaulo2018/14-Containers/#source","text":"Page sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataSaoPaulo2018/Materials/","text":"Data Sao Paulo School Materials We will be using OSG Connect for this set of sessions.
Please visit http://www.osgconnect.net for more information.","title":"Data Sao Paulo School Materials"},{"location":"DataSaoPaulo2018/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataSaoPaulo2018/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataSaoPaulo2018/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataSaoPaulo2018/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataSaoPaulo2018/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataSaoPaulo2018/Materials/#friday-morning-computational-infrastructures-session-5","text":"Lecture 5 - Persistent Identifiers, Digital Objects, an Architecture for a Data Centeric Internet","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataSaoPaulo2018/Materials/#friday-morning-computational-infrastructures-session-6","text":"Close Out - What to do next?","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataSaoPaulo2018/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended #DataSaoPaulo. 
If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"DataSaoPaulo2018/School/","text":"Data Sao Paulo 2018 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials Detailed Schedule Thursday 13-December-2018 Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick Friday 14-December-2018 Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob Quick 11:00 Coffee Break 11:30 Becoming Involved and Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Sao Paulo 2018 Schedule"},{"location":"DataSaoPaulo2018/School/#data-sao-paulo-2018","text":"","title":"Data Sao Paulo 2018"},{"location":"DataSaoPaulo2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"DataSaoPaulo2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataSaoPaulo2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"DataSaoPaulo2018/School/#friday-am-session","text":"A few words on clouds and containers Close out and resources for further collaboration Disclipline spefic tutorials","title":"Friday AM Session"},{"location":"DataSaoPaulo2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataSaoPaulo2018/School/#thursday-13-december-2018","text":"Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:30 Coffee Break 16:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 17:15 Exercise - DAGMAN Rob Quick","title":"Thursday 13-December-2018"},{"location":"DataSaoPaulo2018/School/#friday-14-december-2018","text":"Time Discription Instructor 09:30 Clouds, Containers and Parting Words Rob Quick 10:00 Exercise - Complete Earlier Lessons Rob Quick 11:00 Coffee Break 11:30 Becoming 
Involved and Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 14-December-2018"},{"location":"DataSaoPaulo2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataTrieste2018/","text":"Placeholder.","title":"Index"},{"location":"DataTrieste2018/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.12 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.11, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can tell they are stable because the second digits (a 2 or an 8 in these cases) are even numbers.
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataTrieste2018/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataTrieste2018/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataTrieste2018/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, the execute directory should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this installation of Condor there are six processes running: condor_master, condor_procd, condor_shared_port, condor_collector, condor_schedd, and condor_negotiator. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track the processes (from jobs) that it creates. condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one. Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later). You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataTrieste2018/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataTrieste2018/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataTrieste2018/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataTrieste2018/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.11 May 14 2018 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataTrieste2018/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.10 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.10 (Carbon) Release: 6.10 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? 
Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataTrieste2018/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataTrieste2018/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataTrieste2018/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? 
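As a hint, one way to combine those two options looks something like the following; treat the exact constraint and attribute names here as illustrative only: $ condor_q -constraint 'JobStatus == 2' -format \"%d.\" ClusterId -format \"%d\\n\" ProcId This should print just the cluster.process IDs of jobs that are currently running (a JobStatus of 2 means running, 1 means idle), and it will print nothing if no jobs match the constraint. 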
This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataTrieste2018/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataTrieste2018/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataTrieste2018/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. 
In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include <stdio.h> #include <stdlib.h> #include <unistd.h> int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program. Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transferred back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username, use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . 
If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataTrieste2018/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataTrieste2018/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataTrieste2018/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataTrieste2018/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataTrieste2018/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataTrieste2018/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataTrieste2018/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataTrieste2018/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataTrieste2018/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataTrieste2018/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataTrieste2018/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. 
(This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataTrieste2018/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataTrieste2018/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataTrieste2018/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataTrieste2018/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataTrieste2018/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataTrieste2018/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? 
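If you get stuck, a minimal submit file for this script can follow the same pattern as the one you used for the C program; only the Executable changes, and the log, output, and error file names below are arbitrary choices: Universe = vanilla Executable = simple.sh Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple-sh.log Output = simple-sh.out Error = simple-sh.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue You may also want to keep the same requirements line you used earlier. 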
This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataTrieste2018/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataTrieste2018/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataTrieste2018/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataTrieste2018/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. 
CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. 
$ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataTrieste2018/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataTrieste2018/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataTrieste2018/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataTrieste2018/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataTrieste2018/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. 
It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataTrieste2018/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataTrieste2018/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataTrieste2018/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. 
universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataTrieste2018/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. 
Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataTrieste2018/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataTrieste2018/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataTrieste2018/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. 
It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataTrieste2018/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataTrieste2018/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. 
-o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Move file to local machine for viewing cp mandle.gif ~/public 1. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2018/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2018/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. 
Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataTrieste2018/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run each goatbrot in parallel in our cluster. Here's an example you can run by hand. Run goatbrot 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif You will see what looks like an error, but it does not affect the image stitching. You can ignore this. montage: unable to read font `(null)' @ error/annotate.c/RenderFreetype/1339. This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataTrieste2018/08-Mandlebrot/#try-it","text":"Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. Much as we did above: copy the file into your public directory ( cp mandle.gif ~/public ), and this time, rather than copying it to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataTrieste2018/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep, but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file that has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. 
Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? 
Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2018/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2018/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataTrieste2018/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataTrieste2018/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataTrieste2018/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataTrieste2018/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"DataTrieste2018/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
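Once you have written the four goatbrot submit files described below, a quick diff between any two of them is a handy way to confirm that only the center coordinate and the tile/output names differ (just a checking trick, not part of the exercise): $ diff goatbrot1.sub goatbrot2.sub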
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
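One way to see how this works: the single PARENT ... CHILD line is shorthand for drawing an edge from every listed parent to every listed child, so (as a sketch only, you do not need to change your file) it is equivalent to:
PARENT g1 CHILD montage
PARENT g2 CHILD montage
PARENT g3 CHILD montage
PARENT g4 CHILD montage
Because g1 through g4 have no parents of their own, DAGMan can submit all four at once, and it will not submit montage until every one of them has finished successfully.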
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataTrieste2018/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataTrieste2018/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataTrieste2018/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"DataTrieste2018/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"DataTrieste2018/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because its exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done, and you would ask DAGMan to run the new rescue DAG. For simplicity, DAGMan now lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG. If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83.
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataTrieste2018/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataTrieste2018/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
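(An aside that can help when you are debugging: each failed attempt writes a new, higher-numbered rescue file, goatbrot.dag.rescue002 and so on, and on resubmission DAGMan uses the highest-numbered one by default. A quick way to see how many attempts you have made so far:
$ ls goatbrot.dag.rescue*
goatbrot.dag.rescue001
Here there is only one, since this is our first failure.)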
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataTrieste2018/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataTrieste2018/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2018/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2018/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataTrieste2018/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataTrieste2018/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2018/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2018/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataTrieste2018/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataTrieste2018/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
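Pulling the steps in this section together, here is a minimal sketch of a custom Dockerfile. It assumes the Docker Hub name of the Ubuntu Xenial base image matches the CVMFS path shown above, and it uses imagemagick purely as a placeholder for whatever software your job actually needs:
# start from one of the provided OSG images (assumed Docker Hub name)
FROM opensciencegrid/osgvo-ubuntu-xenial:latest
# install the extra software your job needs (placeholder package)
RUN apt-get update && apt-get install -y imagemagick
# required directories
RUN mkdir -p /cvmfs
Build and push it exactly as described above ( docker build -t namespace/repository_name . then docker push namespace/repository_name ), and register it as described in the next section.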
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataTrieste2018/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataTrieste2018/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataTrieste2018/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. 
The images are: Image Location Definition Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Based on the TensorFlow base image, with a few OSG packages added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataTrieste2018/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting them in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exploring Images on the Submit Host"},{"location":"DataTrieste2018/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataTrieste2018/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with a tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base your image on images not already published by OSG, but if you do this, we recommend that, as one of the steps, you create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image definitions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataTrieste2018/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large number of distributed compute hosts, OSG has chosen to host the images under CVMFS .
Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataTrieste2018/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataTrieste2018/Materials/","text":"Data Trieste School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Friday Morning - Computational Infrastructures - Session 5 Introduction and cloud computing (presentation) (30\u2019) Introduction to EGI and the EGI cloud infrastructure (30\u2019) Demo & exercise: Explore EGI services, Explore AppDB (30\u2019) The cloud-based EGI Notebooks service (presentation) (30\u2019) Training materials are available at: https://documents.egi.eu/document/3349 Friday Morning - Computational Infrastructures - Session 6 Intro to hands-on exercise 1 (10\u2019) Hands-on exercise 1 \u2013 Download and plot temperature data Intro to hands-on exercise 2 (10\u2019) Hands-on exercise 2 \u2013 Add rainfall data The future of compute infrastructures in Europe: EOSC (30\u2019) Next steps to become a user (15\u2019) Training materials are available at: https://documents.egi.eu/document/3349 Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Data Trieste 2018 Materials"},{"location":"DataTrieste2018/Materials/#data-trieste-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Trieste School Materials"},{"location":"DataTrieste2018/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataTrieste2018/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataTrieste2018/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataTrieste2018/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataTrieste2018/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataTrieste2018/Materials/#friday-morning-computational-infrastructures-session-5","text":"Introduction and cloud computing (presentation) (30\u2019) Introduction to EGI and the EGI cloud infrastructure (30\u2019) Demo & exercise: Explore EGI services, Explore AppDB (30\u2019) The cloud-based EGI Notebooks service (presentation) (30\u2019) Training materials are available at: https://documents.egi.eu/document/3349","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataTrieste2018/Materials/#friday-morning-computational-infrastructures-session-6","text":"Intro to hands-on exercise 1 (10\u2019) Hands-on exercise 1 \u2013 Download and plot temperature data Intro to hands-on exercise 2 (10\u2019) Hands-on exercise 2 \u2013 Add rainfall data The future of compute infrastructures in Europe: EOSC (30\u2019) Next steps to become a user (15\u2019) Training materials are available at: https://documents.egi.eu/document/3349","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataTrieste2018/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"DataTrieste2018/School/","text":"Data Trieste 2018 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Disclipline spefic tutorials Detailed Schedule Thursday 16-August-2018 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 17-August-2018 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos, Guiseppe La Rocca 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos, Guiseppe La Rocca 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2018 Schedule"},{"location":"DataTrieste2018/School/#data-trieste-2018","text":"","title":"Data Trieste 2018"},{"location":"DataTrieste2018/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"DataTrieste2018/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataTrieste2018/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"DataTrieste2018/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration Disclipline spefic tutorials","title":"Friday AM Session"},{"location":"DataTrieste2018/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataTrieste2018/School/#thursday-16-august-2018","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 
16-August-2018"},{"location":"DataTrieste2018/School/#friday-17-august-2018","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos, Guiseppe La Rocca 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos, Guiseppe La Rocca 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos, Guiseppe La Rocca 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 17-August-2018"},{"location":"DataTrieste2018/School/#materials","text":"Materials Page","title":"Materials"},{"location":"DataTrieste2019/","text":"Placeholder","title":"Index"},{"location":"DataTrieste2019/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.13 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.6.13, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"DataTrieste2019/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"DataTrieste2019/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.13 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"DataTrieste2019/01-Introduction/#which-condor","text":"We will be using Condor 8.6.13, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. 
Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"DataTrieste2019/01-Introduction/#where-you-will-work","text":"Today you will log into training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"DataTrieste2019/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"DataTrieste2019/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.13 Jan 16 2019 $ $CondorPlatform: X86_64-CentOS_7.6 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/os-release to find out: $ cat /etc/os-release Or you can run: $ hostnamectl Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... 
@ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? 
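As a hint for that question (a sketch only; the exact output depends on your pool), these options ask the collector about other daemon types instead of execute slots:
$ condor_status -schedd     # one line per submit point (schedd) in the pool
$ condor_status -master     # one line per condor_master daemon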
When would the -l option be useful?","title":"Our Condor Installation"},{"location":"DataTrieste2019/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"DataTrieste2019/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do.","title":"Objective"},{"location":"DataTrieste2019/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"DataTrieste2019/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.13 Jan 16 2019 $ $CondorPlatform: X86_64-CentOS_7.6 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"DataTrieste2019/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/os-release to find out: $ cat /etc/os-release Or you can run: $ hostnamectl Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.11-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor execute/ spool/ spool.q1/ spool.q2/ spool.q3/ spool.q4/ spool.q5/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. 
Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"DataTrieste2019/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <128.135.158.195:9618?... @ 08/12/18 16:10:58 OWNER BATCH_NAME SUBMITTED DONE RUN IDLE HOLD TOTAL JOB_IDS 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"DataTrieste2019/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"DataTrieste2019/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"DataTrieste2019/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. 
(A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status -pool flock.opensciencegrid.org Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"DataTrieste2019/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"DataTrieste2019/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
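/* The header names in the listing below appear to have been stripped when this page was
   converted (the lone "#include" at the start of the listing is the remnant); based on
   the calls used (printf, atoi, sleep), the program is assumed to begin with: */
#include <stdio.h>   /* printf */
#include <stdlib.h>  /* atoi */
#include <unistd.h>  /* sleep */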
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. 
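For example (a quick check; the sample user in this lesson shows up as osguser99):
$ whoami
osguser99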
Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. 
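One way the relevant lines might look (a sketch of the idea, reusing the file names from above):
Log = simple.$(Cluster).$(Process).log
Output = simple.$(Cluster).$(Process).out
Error = simple.$(Cluster).$(Process).error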
Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"DataTrieste2019/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"DataTrieste2019/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"DataTrieste2019/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. Create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! 
You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"DataTrieste2019/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"DataTrieste2019/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? 
Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"ConnectTrain\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"DataTrieste2019/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"DataTrieste2019/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"DataTrieste2019/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. 
Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"DataTrieste2019/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"DataTrieste2019/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"DataTrieste2019/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. 
$ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"DataTrieste2019/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"DataTrieste2019/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"DataTrieste2019/04-TipsandTricks/#tips-for-condor_q","text":"condor_q can show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l YOUR_JOB_CLUSTER_NUMBER MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. 
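If you would rather grab several of the attributes discussed below in a single command, a plain grep over the full listing works just as well (a sketch, using only attribute names that appear in the listing above; replace YOUR_JOB_CLUSTER_NUMBER as before): $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep -E '^(RequestCpus|UserLog|Requirements)'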
How many CPUs is the job requesting. (This can be more than one, but for the exercises we will do today it will be 1) $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep RequestCpus RequestCpus = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l YOUR_JOB_CLUSTER_NUMBER | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"DataTrieste2019/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm YOUR_JOB_CLUSTER_NUMBER Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"DataTrieste2019/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. 
It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"DataTrieste2019/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 This script will not be executable without changing the permissions. $ chmod 755 simple.sh Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"DataTrieste2019/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"DataTrieste2019/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 This script will not be executable without changing the permissions. $ chmod 755 simple.sh Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . 
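If you get stuck, a sketch along these lines should be close -- it simply follows the pattern of the earlier submit files, and the log/output/error names are an arbitrary choice: Universe = vanilla Executable = simple.sh Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = sh.log Output = sh.out Error = sh.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue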
You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"DataTrieste2019/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"DataTrieste2019/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. 
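Before writing one, you may want to convince yourself that the module really did put R on your PATH; a quick check along these lines should do it (the exact path and version reported will depend on the module): $ which R $ R --version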
After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"Running a job with R"},{"location":"DataTrieste2019/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"DataTrieste2019/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"DataTrieste2019/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) 
This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"DataTrieste2019/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"DataTrieste2019/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"DataTrieste2019/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. 
It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"DataTrieste2019/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"DataTrieste2019/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"The answer"},{"location":"DataTrieste2019/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. 
We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"DataTrieste2019/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"DataTrieste2019/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"DataTrieste2019/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. 
#!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"ConnectTrain\" requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"DataTrieste2019/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. 
Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"DataTrieste2019/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. 
cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2019/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot ,","title":"A brief detour through the Mandlebrot set"},{"location":"DataTrieste2019/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"DataTrieste2019/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"DataTrieste2019/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above, or more simply by moving the file to a web accessible location. 
cp mandle.gif ~/public point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"DataTrieste2019/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2019/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"DataTrieste2019/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"DataTrieste2019/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual .","title":"What is DAGMan?"},{"location":"DataTrieste2019/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. 
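As a quick reminder of the syntax, every node in a DAG file is declared on a line of the form JOB <node-name> <submit-file> (the keyword is case-insensitive, which is why the example below gets away with a lowercase job ), and later lessons will add PARENT ... CHILD ... lines to describe dependencies between nodes. Our minimal DAG therefore needs exactly one such line.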
Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"DataTrieste2019/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"DataTrieste2019/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? 
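To make the answer to that question explicit: the single PARENT g1 g2 g3 g4 CHILD montage line is what lets the four goatbrot nodes run at the same time while montage waits for all of them to finish. As a sketch, DAGMan also accepts the same relationships spelled out one edge at a time, which some people find easier to read; this more verbose form is equivalent to the DAG above:
JOB g1 goatbrot1.sub
JOB g2 goatbrot2.sub
JOB g3 goatbrot3.sub
JOB g4 goatbrot4.sub
JOB montage montage.sub
PARENT g1 CHILD montage
PARENT g2 CHILD montage
PARENT g3 CHILD montage
PARENT g4 CHILD montage
Either way, montage becomes runnable only after all four of its parent nodes complete successfully.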
Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. ----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. 
DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"DataTrieste2019/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue","title":"montage.sub"},{"location":"DataTrieste2019/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"DataTrieste2019/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"DataTrieste2019/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"DataTrieste2019/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because its exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For simplicity, DAGMan now lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG. If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83.
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"DataTrieste2019/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"DataTrieste2019/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 
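For the Challenge in the failure-handling exercise above, here is a minimal sketch of what such a POST script could look like. The node name (simple2), the script name (checkreturn.sh), and the idea of treating exit code 1 as success are assumptions for illustration; the SCRIPT POST syntax and the $RETURN macro are documented in the Condor manual. In your DAG you would add a line like:
SCRIPT POST simple2 checkreturn.sh $RETURN
and checkreturn.sh could be something like:
#!/bin/bash
# DAGMan substitutes $RETURN with the node job's exit code and passes it as $1.
# Exit 0 (success) when the job returned 1, and fail otherwise, so that
# DAGMan treats a return value of 1 from this particular program as success.
if [ "$1" -eq 1 ]; then
    exit 0
fi
exit 1
Remember that when a POST script is present, it is the POST script's exit code, not the job's, that decides whether the node succeeded.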
06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. 
Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. ----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 
06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"DataTrieste2019/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"DataTrieste2019/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. 
Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2019/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"DataTrieste2019/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"DataTrieste2019/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"DataTrieste2019/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial. Bioinformatics Tutorials Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast Statistical Tutorials Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld Molecular Dynamics Tutorials NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs High Energy Physics Tutorials Calculate ntuples with root $ tutorial root Programming Tutorials Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift Advanced HTC Concepts Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2019/13-DisciplineTutorials/#follow-your-interest-exercises","text":"During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
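Returning to the variable-substitution exercise above: once every goatbrot node uses the single goatbrot.sub file, the whole goatbrot.dag could look like the sketch below. The centers and tile indices simply restate the arguments from the four original submit files, so only the values differ between nodes:
JOB g1 goatbrot.sub
VARS g1 CENTERX="-0.75"
VARS g1 CENTERY="0.75"
VARS g1 TILEX="0"
VARS g1 TILEY="0"
JOB g2 goatbrot.sub
VARS g2 CENTERX="0.75"
VARS g2 CENTERY="0.75"
VARS g2 TILEX="1"
VARS g2 TILEY="0"
JOB g3 goatbrot.sub
VARS g3 CENTERX="-0.75"
VARS g3 CENTERY="-0.75"
VARS g3 TILEX="0"
VARS g3 TILEY="1"
JOB g4 goatbrot.sub
VARS g4 CENTERX="0.75"
VARS g4 CENTERY="-0.75"
VARS g4 TILEX="1"
VARS g4 TILEY="1"
JOB montage montage.sub
PARENT g1 g2 g3 g4 CHILD montage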
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"DataTrieste2019/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"DataTrieste2019/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"DataTrieste2019/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are: Image Location Definition Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Based on the TensorFlow base image, with a few OSG packages added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exploring Images on the Submit Host Images can be explored interactively on the submit hosts by starting them in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build command. We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with a tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base your image on images not already published by OSG, but if you do this, we recommend that, as one of the steps, you create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image definitions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.
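As a concrete illustration of the custom-image workflow above, here is a minimal Dockerfile sketch. It assumes you are basing your image on the OSG Ubuntu Xenial image from Docker Hub and that imagemagick happens to be the extra software you need; both the base-image name and the package are examples, not requirements:
# Hypothetical example: extend the OSG Ubuntu Xenial base image
FROM opensciencegrid/osgvo-ubuntu-xenial:latest
# Add whatever software your job needs (imagemagick is just an example)
RUN apt-get update && apt-get install -y imagemagick && apt-get clean
# Required directory so /cvmfs can be bound into the container
RUN mkdir -p /cvmfs
You would then build and push it with the docker build and docker push commands shown above, using your own namespace/repository_name, and register it as described in the next section.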
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"DataTrieste2019/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"DataTrieste2019/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"DataTrieste2019/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh requirements = (HAS_MODULES =?= true) && (OSGVO_OS_STRING == \"RHEL 6\") && (OpSys == \"LINUX\") && HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. 
The images are: Image Location Definition Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Based on the TensorFlow base image, with a few OSG packages added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"DataTrieste2019/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting them in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exploring Images on the Submit Host"},{"location":"DataTrieste2019/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"DataTrieste2019/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build command. We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with a tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base your image on images not already published by OSG, but if you do this, we recommend that, as one of the steps, you create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image definitions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"DataTrieste2019/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to efficiently distribute the container images to a large number of distributed compute hosts, OSG has chosen to host the images under CVMFS .
Any image publicly available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ . To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"DataTrieste2019/14-Containers/#source","text":"Page sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"DataTrieste2019/Materials/","text":"Data Trieste School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Afternoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Bonus Lecture - Digital Object Architectures DOA and RPID Friday Morning - Computational Infrastructures - Session 5 Introduction to Cloud Computing Friday Morning - Computational Infrastructures - Session 6 Close Out - What to do next? Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research. If you want long\u2010term OSG access, you can go to http://www.osgconnect.net and sign up. Mention you attended Data Trieste 2019 and want to be added to the DOSAR Project.","title":"Data Trieste 2019 Materials"},{"location":"DataTrieste2019/Materials/#data-trieste-school-materials","text":"We will be using OSG Connect for this set of sessions.
Please visit http://www.osgconnect.net for more information.","title":"Data Trieste School Materials"},{"location":"DataTrieste2019/Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"DataTrieste2019/Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"DataTrieste2019/Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"DataTrieste2019/Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"DataTrieste2019/Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"DataTrieste2019/Materials/#bonus-lecture-digital-object-architectures","text":"DOA and RPID","title":"Bonus Lecture - Digital Object Architectures"},{"location":"DataTrieste2019/Materials/#friday-morning-computational-infrastructures-session-5","text":"Introduction to Cloud Computing","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"DataTrieste2019/Materials/#friday-morning-computational-infrastructures-session-6","text":"Close Out - What to do next?","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"DataTrieste2019/Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research. If you want long\u2010term OSG access, you can go to http://www.osgconnect.net and sign up. 
Mention you attended Data Trieste 2019 and want to be added to the DOSAR Project.","title":"Contact information"},{"location":"DataTrieste2019/School/","text":"Data Trieste 2019 High-Level Curriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Discipline specific tutorials Close out and resources for further collaboration Friday AM Session Introduction to Cloud Computing Detailed Schedule Thursday 15-December-2019 Time Description Instructor 08:30 Welcome and the Landscape of Research Computing Rob Quick 09:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 09:30 Profiling your application and finding a home for your workflow Rob Quick 10:00 Exercise - Single and batch submission with HTCondor Rob Quick 10:30 Coffee Break 11:00 Workflows and distributed environments Rob Quick 11:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 13:00 Lunch 14:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 15:00 Exercise - DAGMAN Rob Quick 16:00 Coffee Break 16:30 Containers and HTC Wrap-up Rob Quick 17:00 Exercise - Complete Earlier Lessons Rob Quick Friday 14-December-2018 Time Description Instructor 08:30 Introduction to Cloud Computing Alessandro Costantini 10:00 Coffee Break 10:30 Introduction to Cloud Computing Alessandro Costantini 13:30 Lunch 14:00 CODATA Simon Hodson 14:30 Close Out Rob Quick Materials Materials Page","title":"Data Trieste 2019 Schedule"},{"location":"DataTrieste2019/School/#data-trieste-2019","text":"","title":"Data Trieste 2019"},{"location":"DataTrieste2019/School/#high-level-cirriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"DataTrieste2019/School/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"DataTrieste2019/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Discipline specific tutorials Close out and resources for further collaboration","title":"Thursday PM Session"},{"location":"DataTrieste2019/School/#friday-am-session","text":"Introduction to Cloud Computing","title":"Friday AM Session"},{"location":"DataTrieste2019/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"DataTrieste2019/School/#thursday-15-december-2019","text":"Time Description Instructor 08:30 Welcome and the Landscape of Research Computing Rob Quick 09:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 09:30 Profiling your application and finding a home for your workflow Rob Quick 10:00 Exercise - Single and batch submission with HTCondor Rob Quick 10:30 Coffee Break 11:00 Workflows and distributed environments Rob Quick 11:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 13:00 Lunch 14:00 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 15:00 Exercise - DAGMAN Rob Quick 16:00 Coffee Break 16:30 Containers and HTC Wrap-up Rob Quick 17:00 Exercise - Complete Earlier Lessons Rob Quick","title":"Thursday 
15-December-2019"},{"location":"DataTrieste2019/School/#friday-14-december-2018","text":"Time Discription Instructor 08:30 Intorduction to Cloud Computing Alessandro Costantini 10:00 Coffee Break 10:30 Introduction to Cloud Computing Alessandro Costantini 13:30 Lunch 14:00 CODATA Simon Hodson 14:30 Close Out Rob Quick","title":"Friday 14-December-2018"},{"location":"DataTrieste2019/School/#materials","text":"Materials Page","title":"Materials"},{"location":"Materials/01-Introduction/","text":"High Throughput Computing and Condor Introduction Preliminaries You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.8 manual . You may enjoy browsing the Condor web page . Which Condor? We will be using Condor 8.2.10, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes. Where you will work Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~ The Exercises Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"High Throughput Computing and Condor Introduction"},{"location":"Materials/01-Introduction/#high-throughput-computing-and-condor-introduction","text":"","title":"High Throughput Computing and Condor Introduction"},{"location":"Materials/01-Introduction/#preliminaries","text":"You will receive login credentials at the beginning of this session. You might want to refer to the online Condor 8.6.8 manual . You may enjoy browsing the Condor web page .","title":"Preliminaries"},{"location":"Materials/01-Introduction/#which-condor","text":"We will be using Condor 8.2.10, which is a recent production version of Condor. Condor has two coexisting types of releases at any given time: stable and development. Condor 8.2.X and 7.8.x are considered stable releases, and you can know they are stable because the second digits (a 2 or a 8 in these cases) are even numbers. 
In a given stable series, all versions have the same features (for example 7.8.0 and 7.8.1 have the same set of features) and differ only in bug fixes.","title":"Which Condor?"},{"location":"Materials/01-Introduction/#where-you-will-work","text":"Today you will log into user-training.osgconnect.net for all of your exercises: Login on submission node using: $ ssh -XY YOUR_USER_ID@user-training.osgconnect.net You may get a message asking you to establish the authenticity of this connection. Answer \"yes\". When you login to the machine you will be in your \"home directory\". We recommend that you work in this directory as nobody else can modify the files here. You can always return to your home directory by running the command $ cd ~","title":"Where you will work"},{"location":"Materials/01-Introduction/#the-exercises","text":"Throughout the Condor exercises, you will be given a fair amount of guidance. In several spots, there are suggestions for extra exercises to do \"on your own\" or as \"challenges\". Since you aren't being graded, there is no extra credit for doing them, but we encourage you to try them out. If you prefer, you can come back to the extra credit after you've completed the basic exercises. If you simply cruise through the exercises, you'll probably have free time--we encourage you to delve in more deeply. For all of the exercises, we'll assume that you are logged into user-training.osgconnect.net. You should have received your name and password for user-training.osgconnect.net at the beginning of the Computation Infrastructures lecture.","title":"The Exercises"},{"location":"Materials/02-OurJobManager/","text":"Our Condor Installation Objective This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. daemons) are running, and what they do. Login to the Condor submit computer Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob. Looking at our Condor installation How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.4 Jun 22 2017 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux. Extra Tip: The OS version Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.4-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. 
That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port. condor_q You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. 
Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed. Extra Tip What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual Double bonus points How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs. condor_status You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing. Extra credit What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Our Condor Installation"},{"location":"Materials/02-OurJobManager/#our-condor-installation","text":"","title":"Our Condor Installation"},{"location":"Materials/02-OurJobManager/#objective","text":"This exercise should help you understand the basics of how Condor is installed, what Condor processes (a.k.a. 
daemons) are running, and what they do.","title":"Objective"},{"location":"Materials/02-OurJobManager/#login-to-the-condor-submit-computer","text":"Before you start, make sure you are logged into user-training.osgconnect.net $ hostname user-training.osgconnect.net You should have been given your name and password when you arrived this afternoon. If you don't know them, talk to Rob.","title":"Login to the Condor submit computer"},{"location":"Materials/02-OurJobManager/#looking-at-our-condor-installation","text":"How do you know what version of Condor you are using? Try condor_version : $ condor_version $CondorVersion: 8.6.4 Jun 22 2017 $ $CondorPlatform: X86_64-CentOS_6.9 $ Note that the \"CondorPlatform\" reports the type of computer we built it on, not the computer we're running on. It was built on CentOS_6.8, but you might notice that we're running on Scientific Linux 6.8, which is a free clone of Red Hat Enterprise Linux.","title":"Looking at our Condor installation"},{"location":"Materials/02-OurJobManager/#extra-tip-the-os-version","text":"Do you know how to find the OS version? You can usually look in /etc/issue to find out: $ cat /etc/issue Scientific Linux release 6.9 (Carbon) Kernel \\r on an \\m Or you can run: $ lsb_release -a LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch Distributor ID: Scientific Description: Scientific Linux release 6.9 (Carbon) Release: 6.9 Codename: Carbon Where is Condor installed? # Show the location of the condor_q binary $ which condor_q /usr/bin/condor_q # Show which RPM installed Condor $ rpm -q condor condor-8.6.4-1.osg34.el6.x86_64 Condor has some configuration files that it needs to find. They are in the standard location, /etc/condor $ ls /etc/condor condor_config condor_ssh_to_job_sshd_config_template ganglia.d condor_config.local config.d Condor has some directories that it keeps records of jobs in. Remember that each submission computer keeps track of all jobs submitted to it. That's in the local directory: $ condor_config_val -v LOCAL_DIR LOCAL_DIR = /var # at: /etc/condor/condor_config, line 26 # raw: LOCAL_DIR = /var $ ls -CF /var/lib/condor dead.letter execute/ spool/ The spool directory is where Condor keeps the jobs you submit, while the execute directory is where Condor keeps running jobs. Since this is a submission-only computer, it should be empty. Check if Condor is running. Your output will differ slightly, but you should see condor_master with the other Condor daemons listed under it: $ ps auwx --forest | grep condor_ | grep -v grep condor 2299245 0.0 0.1 50972 7348 ? Ss Jul10 0:08 condor_master -pidfile /var/run/condor/condor_master.pid root 2299287 0.0 0.1 25924 5072 ? S Jul10 1:54 \\_ condor_procd -A /var/run/condor/procd_pipe -L /var/log/condor/ProcLog -R 1000000 -S 60 -C 499 condor 2299288 0.0 0.1 50596 7796 ? Ss Jul10 0:16 \\_ condor_shared_port -f condor 2299289 0.0 0.2 70020 9100 ? Ss Jul10 0:13 \\_ condor_collector -f condor 2299290 0.0 0.5 116132 23872 ? Ss Jul10 6:19 \\_ condor_schedd -f condor 2299291 0.0 0.1 51056 7956 ? Ss Jul10 0:59 \\_ condor_negotiator -f For this version of Condor there are four processes running: the condor_master, the condor_schedd, the condor_procd, and condor_schedd. In general, you might see many different Condor processes. Here's a list of the processes: condor_master : This program runs constantly and ensures that all other parts of Condor are running. If they hang or crash, it restarts them. 
condor_schedd : If this program is running, it allows jobs to be submitted from this computer--that is, your computer is a \"submit machine\". This will advertise jobs to the central manager so that it knows about them. It will contact a condor_startd on other execute machines for each job that needs to be started. condor_procd: This process helps Condor track process (from jobs) that it creates condor_collector: This program is part of the Condor central manager. It collects information about all computers in the pool as well as which users want to run jobs. It is what normally responds to the condor_status command. At the school, it is running on a different computer, and you can figure out which one: Other daemons include: condor_negotiator: This program is part of the Condor central manager. It decides what jobs should be run where. It is run on the same computer as the collector. condor_startd: If this program is running, it allows jobs to be started up on this computer--that is, your computer is an \"execute machine\". This advertises your computer to the central manager so that it knows about this computer. It will start up the jobs that run. condor_shadow: For each job that has been submitted from this computer, there is one condor_shadow running. It will watch over the job as it runs remotely. In some cases it will provide some assistance (see the standard universe later.) You may or may not see any condor_shadow processes running, depending on what is happening on the computer when you try it out. condor_shared_port: Used to assist Condor with networking by allowing multiple Condor processes to share a single network port.","title":"Extra Tip: The OS version"},{"location":"Materials/02-OurJobManager/#condor_q","text":"You can find out what jobs have been submitted on your computer with the condor_q command: $ condor_q -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:26:20 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended The output that you see will be different depending on what jobs are running. Notice what we can see from this: ID : We can see each jobs cluster and process number. For the first job, the cluster is 60256 and the process is 0. OWNER : We can see who owns the job. SUBMITTED : We can see when the job was submitted RUN_TIME : We can see how long the job has been running. ST : We can see what the current state of the job is. I is idle, R is running. PRI : We can see the priority of the job. SIZE : We can see the memory consumption of the job. CMD : We can see the program that is being executed.","title":"condor_q"},{"location":"Materials/02-OurJobManager/#extra-tip","text":"What else can you find out with condor_q? Try any one of: man condor_q condor_q -help condor_q from the online manual","title":"Extra Tip"},{"location":"Materials/02-OurJobManager/#double-bonus-points","text":"How do you use the -constraint or -format options to condor_q ? When would you want them? When would you use the -l option? This might be an easier exercise to try once you submit some jobs.","title":"Double bonus points"},{"location":"Materials/02-OurJobManager/#condor_status","text":"You can find out what computers are in your Condor pool. (A pool is similar to a cluster, but it doesn't have the connotation that all computers are dedicated full-time to computation: some may be desktop computers owned by users.) 
To look, use condor_status: $ condor_status Name OpSys Arch State Activity LoadAv Mem ActvtyTime slot1@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 1+02:46:31 slot2@amundsen.grid.uchicago.edu LINUX X86_64 Owner Idle 0.000 32768 5+01:05:58 slot1@c2 LINUX X86_64 Unclaimed Idle 0.000 48289 3+10:04:49 slot1@dhcp-10-1-202-3 LINUX X86_64 Unclaimed Idle 0.000 3251 0+08:10:13 slot1_1@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+01:09:46 slot1_2@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 6144 0+00:46:46 slot1_3@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2048 0+00:53:08 slot1_4@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 1024 0+05:48:14 slot1_5@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.000 6144 0+00:16:48 slot1_6@dhcp-10-1-202-3 LINUX X86_64 Claimed Busy 0.990 2816 0+13:16:34 ... Let's look at exactly what you can see: Name : The name of the computer. Sometimes this gets chopped off, like above. OpSys : The operating system, though not at the granularity you may wish: It says \"Linux\" instead of which distribution and version of Linux. Arch : The architecture, such as INTEL or PPC. State : The state is often Claimed (when it is running a Condor job) or Unclaimed (when it is not running a Condor job). It can be in a few other states as well, such as Matched. Activity : This is usually something like Busy or Idle. Sometimes you may see a computer that is Claimed, but no job has yet begun on the computer. Then it is Claimed/Idle. Hopefully this doesn't last very long. LoadAv : The load average on the computer. Mem : The computers memory in megabytes. ActvtyTime : How long the computer has been doing what it's been doing.","title":"condor_status"},{"location":"Materials/02-OurJobManager/#extra-credit","text":"What else can you find out with condor_status? Try any one of: man condor_status condor_status -help condor_status from the online manual Note in particular the options like -master and -schedd . When would these be useful? When would the -l option be useful?","title":"Extra credit"},{"location":"Materials/03-FirstManagedJob/","text":"Submitting your first Condor job Objective The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section. First you need a job Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. 
#include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. Now we'll work on running it in Condor, and eventually running lots of copies of it. Submitting your job Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"DataSaoPaulo\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... 
@ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more? Doing a parameter sweep If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"DataSaoPaulo\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. 
First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit . On your own Now that you've gotten your feet wet, try a few things on your own. Just one log file There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work? New outputs for each run You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files? Lots of jobs Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.) Challenges If you have time and feel comfortable with the technical background, try these extra challenges. 
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Submitting your first Condor job"},{"location":"Materials/03-FirstManagedJob/#submitting-your-first-condor-job","text":"","title":"Submitting your first Condor job"},{"location":"Materials/03-FirstManagedJob/#objective","text":"The objective of this exercise to have you run and understand your first Condor job, as well as run small sets of jobs in a parameter sweep. This is an important exercise because it is the basis for everything that follows. If there is anything you don't understand in this exercise, please ask before you continue on. Because this is an important foundation, please seriously consider doing the \u201cOn Your Own\u201d section.","title":"Objective"},{"location":"Materials/03-FirstManagedJob/#first-you-need-a-job","text":"Before you can submit a job to Condor, you need a job. We will quickly write a small program in C. If you aren't an expert C programmer, fear not. We will hold your hand throughout this process. First, because we're going to be compiling some C code, we'll need a compiler. Sometimes this is already loaded onto a system, but in our case, we need to type the following: $ module load gcc Next, create a file called simple.c using your favorite editor. Put it anywhere you like in your home directory. In that file, put the following text. Copy and paste is a good choice: $ mkdir -p ~/condor-test $ cd ~/condor-test Use your preferred text editor to create this C program. (Shown below with nano.) $ nano simple.c Paste in the following C code. #include int main(int argc, char **argv) { int sleep_time; int input; int failure; if (argc != 3) { printf(\"Usage: simple <sleep-time> <integer>\\n\"); failure = 1; } else { sleep_time = atoi(argv[1]); input = atoi(argv[2]); printf(\"Thinking really hard for %d seconds...\\n\", sleep_time); sleep(sleep_time); printf(\"We calculated: %d\\n\", input * 2); failure = 0; } return failure; } Now compile that program: $ gcc -o simple simple.c $ ls -lh simple -rwxrwxr-x 1 roy roy 595K Jun 20 11:12 simple Finally, run the program and tell it to sleep for four seconds and calculate 10 * 2: $ ./simple 4 10 Thinking really hard for 4 seconds... We calculated: 20 Great! You just had a job run locally on the machine you are logged into (user-training.osgconnect.net). The next step is to run this job on a remote computer - and this is a job you can tell Condor to run! Although it clearly isn't an interesting job, it models some of the aspects of a real scientific program: it takes a while to run and it does a calculation. Think back to the lecture. I said that our first step was to have a job to run. 
Now we'll work on running it in Condor, and eventually running lots of copies of it.","title":"First you need a job"},{"location":"Materials/03-FirstManagedJob/#submitting-your-job","text":"Now that you have a job, you just have to tell Condor to run it. Put the following text into a file called submit : Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"DataSaoPaulo\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Let's examine each of these lines: Universe: The vanilla universe means a plain old job. Later on, we'll encounter some special universes. Executable: The name of your program Arguments: These are the arguments you want. They will be the same arguments we typed above. Log: This is the name of a file where Condor will record information about your job's execution. While it's not required, it is a really good idea to have a log. If something goes wrong you can refer to this log to help figure out the problem. Output: Where Condor should put the standard output from your job. Error: Where Condor should put the standard error from your job. Our job isn't likely to have any, but we'll put it there to be safe. should_transfer_files: Tell Condor that it should transfer files, instead of relying on a shared filesystem. While your home directories (on the glite-tutor computers) are mounted on NFS, you do not have user accounts on the worker nodes, so your jobs cannot access files on NFS. In addition, NFS isn't available between the local UI computers and the remote worker nodes. Therefore we will have Condor transfer files to the remote computer. when_to_transfer_output: A technical detail about when files should be transported back to the computer from which you submitted your job. Don't worry about the details for now. If you're really curious, you can read all the details in the Condor manual . Next, tell Condor to run your job: $ condor_submit submit Submitting job(s). 1 job(s) submitted to cluster 16. Now, watch your job run (insert your username in the command below instead of USER . If you forgot your username use the whoami command. Note that most of your output will be different than the example, the important column to watch is the ST column - the job state): # Note the job state of 'I' means the job is idle - not yet running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:08 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:00 I 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # After some time your job will enter the 'R' state which means it is currently running $ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:14 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 2056.0 osguser99 7/19 03:40 0+00:00:02 R 0 0.0 simple 4 10 Total for query: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Total for all users: 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended 1 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended # When your job disappears from the queue that means it completed. 
$ condor_q YOUR_USER_ID -nobatch -- Schedd: user-training.osgconnect.net : <192.170.227.119:9618?... @ 07/19/17 03:41:21 ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD Total for query: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Total for all users: 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Tip : While you are waiting for your job to run and complete you can check out \"A few tips and tricks\" to learn how to user condor_q more effectively. When my job was done, it was no longer listed. Because I told Condor to log information about my job, I can see what happened: $ cat simple.log 000 (032.000.000) 08/18 15:18:13 Job submitted from host: <10.0.0.252:9645> ... 001 (032.000.000) 08/18 15:18:32 Job executing on host: <172.16.200.1:9250> ... 006 (032.000.000) 08/18 15:18:32 Image size of job updated: 7 0 - MemoryUsage of job (MB) 0 - ResidentSetSize of job (KB) ... 005 (032.000.000) 08/18 15:18:33 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 56 - Run Bytes Sent By Job 7059 - Run Bytes Received By Job 56 - Total Bytes Sent By Job 7059 - Total Bytes Received By Job Partitionable Resources : Usage Request Allocated Cpus : 1 1 Disk (KB) : 15 7 17605109 Memory (MB) : 0 1 1900 That looks good: the job started up quickly, though you will often see slightly slower startups. Condor doesn't optimize for fast job startup, but for high throughput, The job ran for four seconds. Now take a look at the job's output: $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Excellent! We ran our sophisticated scientific job on a Condor pool! We've only run one job though. Can we run more?","title":"Submitting your job"},{"location":"Materials/03-FirstManagedJob/#doing-a-parameter-sweep","text":"If you only ever had to run a single job, you probably wouldn't need Condor. But we would like to have our program calculate a whole set of values for different inputs. How can we do that? Let's change our submit file to look like this: Universe = vanilla Executable = simple +ProjectName = \"DataSaoPaulo\" Arguments = 4 10 Log = simple.$(Process).log Output = simple.$(Process).out Error = simple.$(Process).error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Arguments = 4 11 Queue Arguments = 4 12 Queue There are two important differences to notice here. First, the Log, Output and Error lines have the $(Process) macro in them. This means that the output and error files will be named according to the process number of the job. You'll see what this looks like in a moment. Second, we told Condor to run the same job an extra two times by adding extra Arguments and Queue statements. We are doing a parameter sweep on the values 10, 11, and 12. Let's see what happens: $ condor_submit submit Submitting job(s)... 3 job(s) submitted to cluster 18. 
$ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 I 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 4 idle, 0 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 34.0 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 0 34 34.1 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 1 34 34.2 kagross 8/18 15:28 0+00:00:00 R 0 0.0 simple 2 34 3 jobs; 0 completed, 0 removed, 0 idle, 4 running, 0 held, 0 suspended $ condor_q USER -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended $ ls simple*out simple.0.out simple.1.out simple.2.out simple.out $ cat simple.0.out Thinking really hard for 4 seconds... We calculated: 20 $ cat simple.1.out Thinking really hard for 4 seconds... We calculated: 22 $ cat simple.2.out Thinking really hard for 4 seconds... We calculated: 24 Notice that we had three jobs with the same cluster number, but different process numbers. They have the same cluster number because they were all submitted from the same submit file. When the jobs ran, they created three different output files, each with the desired output. You are now ready to submit lots of jobs! Although this example was simple, Condor has many, many options so you can get a wide variety of behaviors. You can find many of these if you look at the documentation for condor_submit .","title":"Doing a parameter sweep"},{"location":"Materials/03-FirstManagedJob/#on-your-own","text":"Now that you've gotten your feet wet, try a few things on your own.","title":"On your own"},{"location":"Materials/03-FirstManagedJob/#just-one-log-file","text":"There's no reason to have a separate log file for each job. Change your submit file so that it uses a single log file. Does it all still work?","title":"Just one log file"},{"location":"Materials/03-FirstManagedJob/#new-outputs-for-each-run","text":"You might have noticed that the output files were over-written when you re-ran the jobs. (That is, simple.1.out was just re-written.) That was okay for a simple exercise, but it might be very bad if you had wanted to keep around the results. Maybe you changed a parameter or rebuilt your program, and you want to compare the outputs. Just like you used $(Process) , you can also use $(Cluster) . This will be a number from your job ID. For example, it would be 34 from the above example. Change your submit file to use $(Cluster) and $(Process) . If you do two job submissions, will you have separate output files?","title":"New outputs for each run"},{"location":"Materials/03-FirstManagedJob/#lots-of-jobs","text":"Instead of specifying the Arguments multiple times with multiple queue statements, try this: Arguments = $(Process) $(Cluster) queue 10 What does it mean? What happens? Does it work as you expect? (An aside: you might wish to be able to do math, something like $(Process)+1 . Unfortunately, you can't do that.)","title":"Lots of jobs"},{"location":"Materials/03-FirstManagedJob/#challenges","text":"If you have time and feel comfortable with the technical background, try these extra challenges. 
You'll need to peruse the Condor manual (particularly the manual page for condor_submit ) to find answers. Feel free to ask Rob--he'd love to give you hints! Make another scientific program (probably just modify simple.c) that takes its input from a file. Now submit 3 copies of this program where each input file is in a separate directory. Use the initialdir option described in the manual . This will let you specify a directory for the input to the program. You can run specify the initialdir with $(Process) . You can specify extra files to copy with transfer_input_files . Now you're really learning the basics of running something like a real scientific job! Condor can send you email when a job finishes. How can you control this? You know that your job should never run for more than four hours. If it does, then the job should be killed because there is a problem. How can you tell Condor to do this for you?","title":"Challenges"},{"location":"Materials/04-TipsandTricks/","text":"A few tips and tricks Objective This exercise will teach you a few nifty commands to help you use Condor more easily. Tips for condor_q Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? (It might be more than one if there were recoverable errors.) $ condor_q -l 24.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 24.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd? Removing jobs If you submit a job that you realize has a problem, you can remove it with condor_rm . 
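condor_rm accepts either a full job ID (cluster.process) or just a cluster number, and it also has an -all option. A quick sketch, where the IDs are only placeholders:
$ condor_rm 57.0     # remove the single job 57.0
$ condor_rm 57       # remove every job in cluster 57
$ condor_rm -all     # remove all of your jobs (you cannot remove other users' jobs)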
For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm . Historical information You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"A few tips and tricks"},{"location":"Materials/04-TipsandTricks/#a-few-tips-and-tricks","text":"","title":"A few tips and tricks"},{"location":"Materials/04-TipsandTricks/#objective","text":"This exercise will teach you a few nifty commands to help you use Condor more easily.","title":"Objective"},{"location":"Materials/04-TipsandTricks/#tips-for-condor_q","text":"Curious where your jobs are running? Use the -run option to see where jobs are running. (Idle jobs are not shown.) $ condor_q -run -nobatch -- Submitter: frontal.cci.ucad.sn : <10.0.0.252:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME HOST(S) 28.44 kagross 8/18 14:51 0+00:00:42 slot1@node2.cci.ucad.sn 28.45 kagross 8/18 14:51 0+00:00:37 slot2@node2.cci.ucad.sn 28.46 kagross 8/18 14:51 0+00:00:32 slot3@node2.cci.ucad.sn 28.47 kagross 8/18 14:51 0+00:00:27 slot4@node2.cci.ucad.sn 28.48 kagross 8/18 14:51 0+00:00:20 slot1@frontal.cci.ucad.sn 28.49 kagross 8/18 14:51 0+00:00:14 slot2@frontal.cci.ucad.sn condor_q can also show you your job ClassAd. Recall back to the lecture and the discussion of ClassAds. For instance, you can look at the ClassAd for a single job: $ condor_q -l 23.0 MaxHosts = 1 User = \"kagross@frontal.cci.ucad.sn\" OnExitHold = false CoreSize = 0 MachineAttrCpus0 = 1 WantRemoteSyscalls = false MyType = \"Job\" Rank = 0.0 CumulativeSuspensionTime = 0 MinHosts = 1 PeriodicHold = false PeriodicRemove = false Err = \"simple.49.error\" ProcId = 49 EnteredCurrentStatus = 1408374244 UserLog = \"/home/kagross/condor-test/s ... output trimmed ... There are some interesting parts you can check out. How many times has this job run? 
(It might be more than one if there were recoverable errors.) $ condor_q -l 24.0 | grep JobRunCount JobRunCount = 1 Where is the user log for this job? This is helpful when you assist someone else in debugging and they're not sure. $ condor_q -l 24.0 | grep UserLog UserLog = \"/home/kagross/condor-test/simple.47.log\" What are the job's requirements? Condor automatically fills some in for you to make sure your job runs on a reasonable computer in our cluster, but you can override any of these. I've broken the output into multiple lines to explain it to you. $ condor_q -l 23.0 | grep Requirements Requirements =( TARGET.Arch == \"X86_64\" ) # Run on a 64-bit computer && ( TARGET.OpSys == \"LINUX\" ) # Make sure you run on Linux && ( TARGET.Disk >= RequestDisk ) # Make sure the default disk Condor is on has enough disk space. && ( TARGET.Memory >= RequestMemory ) # Make sure the computer has enough memory && ( TARGET.HasFileTransfer ) # Only run on a computer that can accept your files. What else can you find that's interesting in the ClassAd?","title":"Tips for condor_q"},{"location":"Materials/04-TipsandTricks/#removing-jobs","text":"If you submit a job that you realize has a problem, you can remove it with condor_rm . For example: $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 29.0 roy 6/21 15:23 0+00:00:00 I 0 0.7 simple 60 10 1 jobs; 0 completed, 0 removed, 2 idle, 0 running, 0 held, 0 suspended $ condor_rm 29.0 Job 29.0 marked for removal $ condor_q -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 1 idle, 0 running, 0 held, 0 suspended A few tips: You can remove all of your jobs with the -all option. You can't remove other users jobs. There are fancy options to condor_rm .","title":"Removing jobs"},{"location":"Materials/04-TipsandTricks/#historical-information","text":"You can see information about jobs that completed and are no longer in the queue with the condor_history command. It's rare that you want to see all the jobs, so try looking at jobs for just you: $ condor_history USER For example: $ condor_history kagross 9.9 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9 9 9.8 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 8 9 9.11 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 11 9 9.7 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 7 9 9.5 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 5 9 9.6 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 6 9 9.3 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 3 9 9.2 kagross 7/31 12:44 0+00:00:02 C 7/31 12:44 /home/kagross/simple 2 9 9.1 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 1 9 9.0 kagross 7/31 12:44 0+00:00:03 C 7/31 12:44 /home/kagross/simple 9.4 kagross 7/31 12:44 0+00:00:01 C 7/31 12:44 /home/kagross/simple 4 9 8.0 kagross 7/31 12:42 0+00:00:07 C 7/31 12:42 /home/kagross/simple 4 10 ...","title":"Historical information"},{"location":"Materials/05-ScriptingJob/","text":"Using scripting languages Objective The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. 
Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission. Challenge Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Using scripting languages"},{"location":"Materials/05-ScriptingJob/#using-scripting-languages","text":"","title":"Using scripting languages"},{"location":"Materials/05-ScriptingJob/#objective","text":"The objective of this exercise is to demonstrate that you can submit jobs to Condor in any language, including scripting languages. At this point, you might be asking yourself, \"This is all well and good, but I don't write programs in C. Can I use other languages?\" Absolutely. Let's assume you like to write program in Bourne shell. Make sure your program begins with #!/bin/sh , and you're good to go. Save this example code into a file called simple.sh using nano or your favorite editor. #!/bin/sh if [ $# -ne 2 ]; then echo \"Usage: simple.sh sleep-time integer\" exit 1 fi echo \"Thinking really hard for $1 seconds..\" sleep $1 answer=$(( $2 * 2 )) echo \"We calculated $answer.\" exit 0 Can you write a submit file to run this job? This should be easy--the script is your Executable , not /bin/sh . You may also want to change the name of your submit.log , submit.out , and submit.err in your submit file to be sure they are not written over when you run this submission.","title":"Objective"},{"location":"Materials/05-ScriptingJob/#challenge","text":"Rewrite this script in Perl or Python (if you're comfortable with one of those languages). Does it still work for you?","title":"Challenge"},{"location":"Materials/06-RJob/","text":"Running a job with R Objective The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise. The Problem Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. 
Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers... Pros and cons of having your system administrator do it for you PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them. Pros and cons of bringing it along CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes. Setup OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. 
#!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r On your own Write a Condor submit file that will use R to run the demo.r program. You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program. The answer This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"Running a job with R"},{"location":"Materials/06-RJob/#running-a-job-with-r","text":"","title":"Running a job with R"},{"location":"Materials/06-RJob/#objective","text":"The objective of this exercise is to learn how to run a program that depends on a larger run-time environment that isn't already available on your cluster. The run-time environment refers to all the hardware and software resources available on the machine where your job is running. We will focus on the available software in this exercise.","title":"Objective"},{"location":"Materials/06-RJob/#the-problem","text":"Sometimes you want to run a program that depends on a larger run-time environment. For example, perhaps you wrote your program in Perl, but there is no Perl installed on the cluster. (That's an unlikely example, intended just to give you feel for what I mean by \"run-time environment\".) This is a common problem distributed computing users encounter. For example, many people like to use Matlab (or its open-source cousin, Octave ) or R for doing calculations. These require a fair amount of run-time environment to run the programs you write. What do you do if they aren't installed? There are at least two possibilities: Ask your kindly system administrator to install it for you on all of the computers you might run on. Bring the environment (Such as Octave or R) along with your job. Before you read any further, please stop for a moment, and think about the tradeoffs between these two methodologies. They both have benefits and drawbacks. Why would you choose each of them? Why not? Here are some of my answers...","title":"The Problem"},{"location":"Materials/06-RJob/#pros-and-cons-of-having-your-system-administrator-do-it-for-you","text":"PRO - It's a lot easier for you. PRO - You have to transfer less data with each job. CON - You have to wait for the system administrator to install them. CON - If you want upgrades (or downgrades), you have to ask again and wait for them.","title":"Pros and cons of having your system administrator do it for you"},{"location":"Materials/06-RJob/#pros-and-cons-of-bringing-it-along","text":"CON - It's more complex for you. CON - You have to transfer the application and data with each job (or have a job that pre-stages it for you.) 
PRO - You are in complete control: when you need a tweak, an upgrade, or a downgrade, you can make it happen. Clearly, there is a choice here. I'd like to enable you to be able to bring along your run-time environment with you. In my experience, if you are capable of bringing it with you, you can take advantage of more computers: you don't have to wait for someone to build and install the environment for you. We'd like to demonstrate how to install one run-time environment called R . Don't worry if you have no experience with it: neither do I. It's a package for doing math and statistics and it lets you write programs in the R language. (Actually, it's the S language. Who chose names like this that are hard to Google?) We've built a minimal version of R that won't do graphical output, which makes it much less interesting, but it's good enough for our purposes.","title":"Pros and cons of bringing it along"},{"location":"Materials/06-RJob/#setup","text":"OSG's implementation of CVMFS is called OASIS and we will be using this to gain access to R. $ source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/5.6.2/init/bash Let's look at what software is available: $ module avail And then we load the R module: $ module load R You'll need an R program. After hours of combing the internet coding, I present to you my first R program. Save it in a file called demo.r using nano or your favorite editor: len <- 100 fibvals <- numeric(len) fibvals[1] <- 1 fibvals[2] <- 1 for (i in 3:len) { fibvals[i] <- fibvals[i-1]+fibvals[i-2] } print(\"First 100 Fibonacci numbers:\") for (i in 1:len) { print(fibvals[i], digits = 21) } print(\"Number of possible combinations of cards in a 52 card deck:\") comb <- factorial(52) print(comb, digits = 21) This program prints the first 100 Fibonacci numbers . These are the numbers that show up in the weirdest places, like pineapples and sunflowers. It's a sequence of numbers beginning with 0, 1, 1, 2, 3, 5, 8... where each successive number is the sum of the previous two numbers. It also prints 52 factorial, which is the number of possible combinations of a standard 52-card deck (not including the jokers, of course). R is a bit fussy about where it's been installed on disk, so I had to write a wrapper program so it will happily run wherever it lands in our cluster. I could make you work it out, but that seems unfair. Save this program in run-r.sh . If you're curious about exactly why it's needed, ask Rob. There are two important parts to it, and you should know what they are, at least conceptually because this is the magic you would need to do for any run-time environment you want to bring along. Load the R environment using module . In general you might have to do more work. Invoke R, using whatever magic is needed. In our case, I set up some environment variables and invoke the right executable. #!/bin/sh -x if [ $# -ne 1 ]; then echo \"Usage: run-r \" exit 1 fi # Step 1: Set up our environment, the R module. source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load R module load libgfortran # Step 2, Invoke R with the proper environment R --slave --vanilla < $1 You could easily execute this on OSG Connect locally by making the shell script executable and executing it. $ chmod 755 run-r.sh $ ./run-r.sh demo.r","title":"Setup"},{"location":"Materials/06-RJob/#on-your-own","text":"Write a Condor submit file that will use R to run the demo.r program. 
You will need to include the following line in your submit file (before the \"queue\" statement) to make sure that Condor looks for a resource that uses OASIS: requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) Make sure you get back the output. Make sure you transfer the program.","title":"On your own"},{"location":"Materials/06-RJob/#the-answer","text":"This should be easy for you now, but if it's not, here's the answer, just in case. universe = vanilla executable = run-r.sh +ProjectName = \"ConnectTrain\" arguments = demo.r transfer_input_files = demo.r log = R.log.$(Cluster).$(Process) error = R.err.$(Cluster).$(Process) output = R.out.$(Cluster).$(Process) requirements = (HAS_CVMFS_oasis_opensciencegrid_org =?= TRUE) queue","title":"The answer"},{"location":"Materials/07-WorkingwithFiles/","text":"Working with data in files Objective The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job. Data Movement So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . 
Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"DataSaoPaulo\" ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay? On your own Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"Working with data in files"},{"location":"Materials/07-WorkingwithFiles/#working-with-data-in-files","text":"","title":"Working with data in files"},{"location":"Materials/07-WorkingwithFiles/#objective","text":"The objective of this exercise is to teach you how to provide files as input to your job, and get output as files back from your job.","title":"Objective"},{"location":"Materials/07-WorkingwithFiles/#data-movement","text":"So far, we've done really simple examples where the entire input to the program is just on the command-line. What do you do if you have data files to deal with? Let's walk through a short example. First, let's make a program, call it analyze.sh that analyzes a text file that it is provided on the command-line. #!/bin/sh if [ $# -ne 1 ]; then echo \"Usage: analyze.sh \" exit 1 fi echo \"About to do a deep analysis of $1...\" echo \"First, we convert it to all upper case (see $1.upper)\" tr \"[:lower:]\" \"[:upper:]\" < $1 > $1.upper echo \"Next, we find the 10 most common words (see $1.10)\" cat $1 | tr \"[:upper:]\" \"[:lower:]\" | tr -cs \"[:alpha:]\" \"\\n\" | sort | uniq -c | sort --key=1,7 -n -r | head -10 > $1.10 sleep 5 You also need a file to analyze. Put the following text into a file called gettysburg . Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. 
It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. Our submit file looks nearly identical to what we had before, except for the one bolded line that specifies the data file to transfer. Put the following text into a file called submit.speech . Universe = vanilla Executable = analyze.sh Output = analyze.out Error = analyze.error Log = analyze.log Arguments = gettysburg +ProjectName = \"DataSaoPaulo\" ShouldTransferFiles = Yes WhenToTransferOutput = ON_EXIT transfer_input_files = gettysburg queue Notice that you just had to specify the input files and not the output files. Condor will automatically transfer back any new files, so you don't have to worry about it. Nifty, huh? Now run the job. $ condor_submit submit.speech Submitting job(s). 1 job(s) submitted to cluster 37. $ ls -lh gettys* -rw-rw-r--. 1 kagross kagross 1.5K Aug 18 15:41 gettysburg -rw-r--r--. 1 kagross kagross 120 Aug 18 15:42 gettysburg.10 -rw-r--r--. 1 kagross kagross 1.5K Aug 18 15:42 gettysburg.upper You got your files! Check them out--do they look okay?","title":"Data Movement"},{"location":"Materials/07-WorkingwithFiles/#on-your-own","text":"Create several text files, then submit jobs (preferably from a single submit file) to analyze each of them. If you're at a loss to create some text files, here are a few for you. Walkthrough of the Original Text Game \"Adventure\" The Story of Captain Midnight The Universal Geek Code Tao of Programming Instead of downloading these files and transferring them directly, can you change your transfer-input-files to use a URL and have Condor download them for you? Give this a try.","title":"On your own"},{"location":"Materials/08-Mandlebrot/","text":"A brief detour through the Mandlebrot set Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot , A simple invocation of goatbrot You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point Browser at the file URL: firefox ./mandle.gif The goatbroat program takes several parameters. Let's break them down: -i 1000 The number of iterations. 
Bigger numbers generate more accurate images but are slower to run. -o tile_000000_000000.ppm The output file to generate. -c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here it is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall. Dividing goatbrot up The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations and then stitched them together? Once we do that, we can run each goatbrot invocation in parallel in our cluster. Here's an example you can run by hand. Run goatbrot 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together. Try it! Run the commands above and make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Copy the file into your public directory: cp mandle.gif ~/public 2. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"A brief detour through the Mandlebrot set"},{"location":"Materials/08-Mandlebrot/#a-brief-detour-through-the-mandlebrot-set","text":"Before we dive into a more complicated DAG, let's get a more interesting job. I'm tired of this lazy, sleepy job that only does trivial mathematics. Let's make pretty pictures! We have a small program that draws pictures of the Mandlebrot set. You can read about the Mandlebrot set on Wikipedia , or you can simply appreciate the pretty pictures. It's a fractal. We have a simple program that can draw the Mandlebrot set. It's called goatbrot .","title":"A brief detour through the Mandlebrot set"},{"location":"Materials/08-Mandlebrot/#a-simple-invocation-of-goatbrot","text":"You can generate the Mandlebrot set with two simple commands. Generate a PPM image of the Mandlebrot set: $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c 0,0 -w 3 -s 1000,1000 Add the Fast Fourier Transform and ImageMagick packages: $ module load fftw $ module load imagemagick Convert it to a JPEG image and write into your home directory: $ convert tile_000000_000000.ppm ~/mandle.gif Open a new terminal window and move the file to local machine for viewing (substitute your username in place of USER ): $ scp USER@user-training.osgconnect.net:/home/USER/mandle.gif ./ Point your browser at the file URL: firefox ./mandle.gif The goatbrot program takes several parameters. Let's break them down: -i 1000 The number of iterations. -o tile_000000_000000.ppm The output file to generate.
-c 0,0 The center point of the image. Here it is the point (0,0). -w 3 The width of the image. Here is 3. -s 1000,1000 The size of the final image. Here we generate a picture that is 1000 pixels wide and 1000 pixels tall.","title":"A simple invocation of goatbrot"},{"location":"Materials/08-Mandlebrot/#dividing-goatbrot-up","text":"The Mandlebrot set can take a while to create, particularly if you make the iterations large or the image size large. What if we broke the creation of the image into multiple invocations then stitched them together? Once we do that, we can run the each goatbroat in parallel in our cluster. Here's an example you can run by hand. Run goatbroat 4 times : $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000000.ppm -c -0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000000_000001.ppm -c 0.75,0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000000.ppm -c -0.75,-0.75 -w 1.5 -s 500,500 $ /stash/user/rquick/public/goatbrot-master/goatbrot -i 1000 -o tile_000001_000001.ppm -c 0.75,-0.75 -w 1.5 -s 500,500 Stitch them together : $ montage tile_000000_000000.ppm tile_000000_000001.ppm tile_000001_000000.ppm tile_000001_000001.ppm -mode Concatenate -tile 2x2 ~/mandle.gif This will produce the same image as above. We broke the image space into a 2 by 2 grid and ran goatbrot on each section of the grid. The montage program simply stitches the files together.","title":"Dividing goatbrot up"},{"location":"Materials/08-Mandlebrot/#try-it","text":"Run the commands above, make sure you can create the Mandlebrot image. When you create the image, you might wonder how you can view it. The same way we did above. 1. Move file to local machine for viewing cp mandle.gif ~/public 1. This time, rather than copying your file to your local system, point your browser at the stash web server: http://stash.osgconnect.net/~USER","title":"Try it!"},{"location":"Materials/09-SimpleDAG/","text":"Coordinating set of jobs: A simple DAG Objective The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job. What is DAGMan? Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acylic graph. For example, you may wish to run a large parameter sweep but before the sweep run you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found at in the Condor manual . Submitting a simple DAG We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that you have a submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. 
In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. ----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 
22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 
06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). $ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.* On your own Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here? Challenge What is the scheduler universe? 
Why does DAGMan use it?","title":"Coordinating set of jobs: A simple DAG"},{"location":"Materials/09-SimpleDAG/#coordinating-set-of-jobs-a-simple-dag","text":"","title":"Coordinating set of jobs: A simple DAG"},{"location":"Materials/09-SimpleDAG/#objective","text":"The objective of this exercise is to learn the very basics of running a set of jobs, where our set is just one job.","title":"Objective"},{"location":"Materials/09-SimpleDAG/#what-is-dagman","text":"Your tutorial leader will introduce you to DAGMan and DAGs. In short, DAGMan lets you submit complex sequences of jobs as long as they can be expressed as a directed acyclic graph. For example, you may wish to run a large parameter sweep, but before the sweep runs you need to prepare your data. After the sweep runs, you need to collate the results. DAGMan has many abilities such as throttling jobs, recovery from failures, and more. More information about DAGMan can be found in the Condor manual .","title":"What is DAGMan?"},{"location":"Materials/09-SimpleDAG/#submitting-a-simple-dag","text":"We're going to go back to the \"simple\" example that we did first. (The one with the job that slept and multiplied a number by 2.) Make sure that your submit file has only one queue command in it, as when we first wrote it. And we will just run vanilla universe jobs for now, though we could equally well run standard universe jobs. Universe = vanilla Executable = simple Arguments = 4 10 +ProjectName = \"ConnectTrain\" Log = simple.log Output = simple.out Error = simple.error should_transfer_files = YES when_to_transfer_output = ON_EXIT Queue Make sure you've built the simple program. If you need to, go back to the instructions for your first job to do it again. We are going to get a bit more sophisticated in submitting our jobs now. Let's have three windows open. In one window you'll submit the job. In the second you will watch the queue. And in the third you will watch what DAGMan does. First we will create the most minimal DAG that can be created: a DAG with just one node. Put the text below into a file named simple.dag . job simple submit In your first window, submit the DAG: $ condor_submit_dag simple.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : simple.dag.condor.sub Log of DAGMan debugging messages : simple.dag.dagman.out Log of Condor library output : simple.dag.lib.out Log of Condor library error messages : simple.dag.lib.err Log of the life of condor_dagman itself : simple.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 61. 
----------------------------------------------------------------------- In the second window, watch the queue (press Ctrl+C when finished watching to kill this process): $ watch -n 10 condor_q USER -nobatch -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:00:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:01:25 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:00 I 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 1 idle, 1 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 61.0 roy 6/21 22:51 0+00:03:47 R 0 0.3 condor_dagman 62.0 roy 6/21 22:51 0+00:00:03 R 0 0.7 simple 4 10 2 jobs; 0 completed, 0 removed, 0 idle, 2 running, 0 held, 0 suspended -- Submitter: osg-ss-submit.chtc.wisc.edu : <128.104.100.55:9618?sock=28867_10e4_2> : osg-ss-submit.chtc.wisc.edu ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Ctrl-C In the third window, watch what DAGMan does: $ tail -f --lines=500 simple.dag.dagman.out 6/21/12 22:51:13 Setting maximum accepts per cycle 8. 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 ** condor_scheduniv_exec.61.0 (CONDOR_DAGMAN) STARTING UP 06/21/12 22:51:13 ** /usr/bin/condor_dagman 06/21/12 22:51:13 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/21/12 22:51:13 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/21/12 22:51:13 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/21/12 22:51:13 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/21/12 22:51:13 ** PID = 5812 06/21/12 22:51:13 ** Log last touched 6/21 22:51:00 06/21/12 22:51:13 ****************************************************** 06/21/12 22:51:13 Using config source: /etc/condor/condor_config 06/21/12 22:51:13 Using local config sources: 06/21/12 22:51:13 /etc/condor/config.d/00-chtc-global.conf 06/21/12 22:51:13 /etc/condor/config.d/01-chtc-submit.conf 06/21/12 22:51:13 /etc/condor/config.d/02-chtc-flocking.conf 06/21/12 22:51:13 /etc/condor/config.d/03-chtc-jobrouter.conf 06/21/12 22:51:13 /etc/condor/config.d/04-chtc-blacklist.conf 06/21/12 22:51:13 /etc/condor/config.d/99-osg-ss-group.conf 06/21/12 22:51:13 /etc/condor/config.d/99-roy-extras.conf 06/21/12 22:51:13 /etc/condor/condor_config.local 06/21/12 22:51:13 DaemonCore: command socket at <128.104.100.55:60417> 06/21/12 22:51:13 DaemonCore: private command socket at <128.104.100.55:60417> 06/21/12 22:51:13 Setting maximum accepts per cycle 8. 
06/21/12 22:51:13 DAGMAN_USE_STRICT setting: 0 06/21/12 22:51:13 DAGMAN_VERBOSITY setting: 3 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_SIZE setting: 5242880 06/21/12 22:51:13 DAGMAN_DEBUG_CACHE_ENABLE setting: False 06/21/12 22:51:13 DAGMAN_SUBMIT_DELAY setting: 0 06/21/12 22:51:13 DAGMAN_MAX_SUBMIT_ATTEMPTS setting: 6 06/21/12 22:51:13 DAGMAN_STARTUP_CYCLE_DETECT setting: False 06/21/12 22:51:13 DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: 5 06/21/12 22:51:13 DAGMAN_USER_LOG_SCAN_INTERVAL setting: 5 06/21/12 22:51:13 allow_events (DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS) setting: 114 06/21/12 22:51:13 DAGMAN_RETRY_SUBMIT_FIRST setting: True 06/21/12 22:51:13 DAGMAN_RETRY_NODE_FIRST setting: False 06/21/12 22:51:13 DAGMAN_MAX_JOBS_IDLE setting: 0 06/21/12 22:51:13 DAGMAN_MAX_JOBS_SUBMITTED setting: 0 06/21/12 22:51:15 DAGMAN_MAX_PRE_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_MAX_POST_SCRIPTS setting: 0 06/21/12 22:51:15 DAGMAN_ALLOW_LOG_ERROR setting: False 06/21/12 22:51:15 DAGMAN_MUNGE_NODE_NAMES setting: True 06/21/12 22:51:15 DAGMAN_PROHIBIT_MULTI_JOBS setting: False 06/21/12 22:51:15 DAGMAN_SUBMIT_DEPTH_FIRST setting: False 06/21/12 22:51:15 DAGMAN_ALWAYS_RUN_POST setting: True 06/21/12 22:51:15 DAGMAN_ABORT_DUPLICATES setting: True 06/21/12 22:51:15 DAGMAN_ABORT_ON_SCARY_SUBMIT setting: True 06/21/12 22:51:15 DAGMAN_PENDING_REPORT_INTERVAL setting: 600 06/21/12 22:51:15 DAGMAN_AUTO_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_MAX_RESCUE_NUM setting: 100 06/21/12 22:51:15 DAGMAN_WRITE_PARTIAL_RESCUE setting: True 06/21/12 22:51:15 DAGMAN_DEFAULT_NODE_LOG setting: null 06/21/12 22:51:15 DAGMAN_GENERATE_SUBDAG_SUBMITS setting: True 06/21/12 22:51:15 ALL_DEBUG setting: 06/21/12 22:51:15 DAGMAN_DEBUG setting: 06/21/12 22:51:15 argv[0] == \"condor_scheduniv_exec.61.0\" 06/21/12 22:51:15 argv[1] == \"-Lockfile\" 06/21/12 22:51:15 argv[2] == \"simple.dag.lock\" 06/21/12 22:51:15 argv[3] == \"-AutoRescue\" 06/21/12 22:51:15 argv[4] == \"1\" 06/21/12 22:51:15 argv[5] == \"-DoRescueFrom\" 06/21/12 22:51:15 argv[6] == \"0\" 06/21/12 22:51:15 argv[7] == \"-Dag\" 06/21/12 22:51:15 argv[8] == \"simple.dag\" 06/21/12 22:51:15 argv[9] == \"-CsdVersion\" 06/21/12 22:51:15 argv[10] == \"$CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $\" 06/21/12 22:51:15 argv[11] == \"-Force\" 06/21/12 22:51:15 argv[12] == \"-Dagman\" 06/21/12 22:51:15 argv[13] == \"/usr/bin/condor_dagman\" 06/21/12 22:51:15 Default node log file is: 06/21/12 22:51:15 DAG Lockfile will be written to simple.dag.lock 06/21/12 22:51:15 DAG Input file is simple.dag 06/21/12 22:51:15 Parsing 1 dagfiles 06/21/12 22:51:15 Parsing simple.dag ... 06/21/12 22:51:15 Dag contains 1 total jobs 06/21/12 22:51:15 Sleeping for 12 seconds to ensure ProcessId uniqueness 06/21/12 22:51:27 Bootstrapping... 06/21/12 22:51:27 Number of pre-completed nodes: 0 06/21/12 22:51:27 Registering condor_event_timer... 06/21/12 22:51:28 Sleeping for one second for log file consistency 06/21/12 22:51:29 MultiLogFiles: truncating log file /home/roy/condor/simple.log 06/21/12 22:51:29 Submitting Condor Node Simple job(s)... # Here's where the job is submitted 06/21/12 22:51:29 submitting: condor_submit -a dag_node_name' '=' 'Simple -a +DAGManJobId' '=' '61 -a DAGManJobId' '=' '61 -a submit_event_notes' '=' 'DAG' 'Node:' 'Simple -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"\" submit 06/21/12 22:51:30 From submit: Submitting job(s). 06/21/12 22:51:30 From submit: 1 job(s) submitted to cluster 62. 
06/21/12 22:51:30 assigned Condor ID (62.0.0) 06/21/12 22:51:30 Just submitted 1 job this cycle... 06/21/12 22:51:30 Currently monitoring 1 Condor log file(s) 06/21/12 22:51:30 Event: ULOG_SUBMIT for Condor Node Simple (62.0.0) 06/21/12 22:51:30 Number of idle job procs: 1 06/21/12 22:51:30 Of 1 nodes total: 06/21/12 22:51:30 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:51:30 === === === === === === === 06/21/12 22:51:30 0 0 1 0 0 0 0 06/21/12 22:51:30 0 job proc(s) currently held 06/21/12 22:55:05 Currently monitoring 1 Condor log file(s) # Here's where DAGMan noticed that the job is running 06/21/12 22:55:05 Event: ULOG_EXECUTE for Condor Node Simple (62.0.0) 06/21/12 22:55:05 Number of idle job procs: 0 06/21/12 22:55:10 Currently monitoring 1 Condor log file(s) 06/21/12 22:55:10 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Currently monitoring 1 Condor log file(s) 06/21/12 22:56:05 Event: ULOG_IMAGE_SIZE for Condor Node Simple (62.0.0) # Here's where DAGMan noticed that the job finished. 06/21/12 22:56:05 Event: ULOG_JOB_TERMINATED for Condor Node Simple (62.0.0) 06/21/12 22:56:05 Node Simple job proc (62.0.0) completed successfully. 06/21/12 22:56:05 Node Simple job completed 06/21/12 22:56:05 Number of idle job procs: 0 06/21/12 22:56:05 Of 1 nodes total: 06/21/12 22:56:05 Done Pre Queued Post Ready Un-Ready Failed 06/21/12 22:56:05 === === === === === === === 06/21/12 22:56:05 1 0 0 0 0 0 0 06/21/12 22:56:05 0 job proc(s) currently held # Here's where DAGMan noticed that all the work is done. 06/21/12 22:56:05 All jobs Completed! 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/21/12 22:56:05 Note: 0 total job deferrals because of node category throttles 06/21/12 22:56:05 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/21/12 22:56:05 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/21/12 22:56:05 **** condor_scheduniv_exec.61.0 (condor_DAGMAN) pid 5812 EXITING WITH STATUS 0 Now verify your results: $ cat simple.log 000 (062.000.000) 06/21 22:51:30 Job submitted from host: <128.104.100.55:9618?sock=28867_10e4_2> DAG Node: Simple ... 001 (062.000.000) 06/21 22:55:00 Job executing on host: <128.104.58.36:46761> ... 006 (062.000.000) 06/21 22:55:09 Image size of job updated: 750 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 006 (062.000.000) 06/21 22:56:00 Image size of job updated: 780 3 - MemoryUsage of job (MB) 2324 - ResidentSetSize of job (KB) ... 005 (062.000.000) 06/21 22:56:00 Job terminated. (1) Normal termination (return value 0) Usr 0 00:00:00, Sys 0 00:00:00 - Run Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Run Local Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Remote Usage Usr 0 00:00:00, Sys 0 00:00:00 - Total Local Usage 57 - Run Bytes Sent By Job 608490 - Run Bytes Received By Job 57 - Total Bytes Sent By Job 608490 - Total Bytes Received By Job Partitionable Resources : Usage Request Cpus : 1 Disk (KB) : 750 750 Memory (MB) : 3 3 ... $ cat simple.out Thinking really hard for 4 seconds... We calculated: 20 Looking at DAGMan's various files, we see that DAGMan itself ran as a Condor job (specifically, a scheduler universe job). 
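(If you are curious, you can confirm the universe directly rather than taking our word for it. Assuming the DAGMan job's cluster ID was 61, as in the log above -- use whatever ID condor_submit_dag printed for you -- ask the job history for its universe: $ condor_history -l 61.0 | grep JobUniverse should show JobUniverse = 7 , and universe number 7 is the scheduler universe.) 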
$ ls simple.dag.* simple.dag.condor.sub simple.dag.dagman.log simple.dag.dagman.out simple.dag.lib.err simple.dag.lib.out $ cat simple.dag.condor.sub # Filename: simple.dag.condor.sub # Generated by condor_submit_dag simple.dag universe = scheduler executable = /usr/bin/condor_dagman getenv = True output = simple.dag.lib.out error = simple.dag.lib.err log = simple.dag.dagman.log remove_kill_sig = SIGUSR1 +OtherJobRemoveRequirements = \"DAGManJobId == $(cluster)\" # Note: default on_exit_remove expression: # ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) # attempts to ensure that DAGMan is automatically # requeued by the schedd if it exits abnormally or # is killed (e.g., during a reboot). on_exit_remove = ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2)) copy_to_spool = False arguments = \"-f -l . -Lockfile simple.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag simple.dag -CsdVersion $CondorVersion:' '7.7.6' 'Apr' '16' '2012' 'BuildID:' '34175' 'PRE-RELEASE-UWCS' '$ -Force -Dagman /usr/bin/condor_dagman\" environment = _CONDOR_DAGMAN_LOG=simple.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0 queue Clean up some of these files: $ rm simple.dag.*","title":"Submitting a simple DAG"},{"location":"Materials/09-SimpleDAG/#on-your-own","text":"Why does DAGman run as a Condor job? Look at the submit file for DAGMan: what does on_exit_remove do? Why is this here?","title":"On your own"},{"location":"Materials/09-SimpleDAG/#challenge","text":"What is the scheduler universe? Why does DAGMan use it?","title":"Challenge"},{"location":"Materials/10-ComplexDAG/","text":"A More Complex DAG Objective The objective of this exercise is to run a real set of jobs with DAGMan. Make your job submission files We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
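Once you have created the four goatbrot submit files shown below, a quick way to convince yourself that they really differ only in those parameters (just a suggestion, not part of the original exercise) is to diff a pair of them: $ diff goatbrot1.sub goatbrot2.sub Only the arguments , output and error lines should show up as differences. 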
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot goatbrot1.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot2.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot3.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue goatbrot4.sub executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue montage.sub You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue wrapper_montage.sh Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif Make your DAG In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job? Running the DAG Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. 
----------------------------------------------------------------------- Watch your DAG Watch with condor_q: $ watch -n 10 condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. 
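(A quick way to check that all four tiles finished, if you like: $ grep -H Completed goatbrot.err.* Each of the four .err files should report Completed: 100.0% .) 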
$ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.* On your own. Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"A More Complex DAG"},{"location":"Materials/10-ComplexDAG/#a-more-complex-dag","text":"","title":"A More Complex DAG"},{"location":"Materials/10-ComplexDAG/#objective","text":"The objective of this exercise is to run a real set of jobs with DAGMan.","title":"Objective"},{"location":"Materials/10-ComplexDAG/#make-your-job-submission-files","text":"We'll run our goatbrot example. If you didn't read about it yet, please do so now . We are going to make a DAG with four simultaneous jobs ( goatbrot ) and one final node to stitch them together ( montage ). This means we have five jobs. We're going to run goatbrot with more iterations (100,000) so it will take longer to run. You can create your five jobs. The goatbrot jobs very similar to each other, but they have slightly different parameters (arguments) and output files. 
I have placed the goatbrot executable in my public directory: /stash/user/rquick/public/goatbrot-master/goatbrot","title":"Make your job submission files"},{"location":"Materials/10-ComplexDAG/#goatbrot1sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,0.75 -w 1.5 -s 500,500 -o tile_0_0.ppm log = goatbrot.log output = goatbrot.out.0.0 error = goatbrot.err.0.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot1.sub"},{"location":"Materials/10-ComplexDAG/#goatbrot2sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,0.75 -w 1.5 -s 500,500 -o tile_0_1.ppm log = goatbrot.log output = goatbrot.out.0.1 error = goatbrot.err.0.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot2.sub"},{"location":"Materials/10-ComplexDAG/#goatbrot3sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c -0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_0.ppm log = goatbrot.log output = goatbrot.out.1.0 error = goatbrot.err.1.0 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot3.sub"},{"location":"Materials/10-ComplexDAG/#goatbrot4sub","text":"executable = /stash/user/rquick/public/goatbrot-master/goatbrot arguments = -i 100000 -c 0.75,-0.75 -w 1.5 -s 500,500 -o tile_1_1.ppm log = goatbrot.log output = goatbrot.out.1.1 error = goatbrot.err.1.1 should_transfer_files = YES when_to_transfer_output = ONEXIT queue","title":"goatbrot4.sub"},{"location":"Materials/10-ComplexDAG/#montagesub","text":"You should notice a few things about the montage submission file: The transfer_input_files statement refers to the files created by the other jobs. We do not transfer the montage program because it is on OASIS. universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = montage.log queue","title":"montage.sub"},{"location":"Materials/10-ComplexDAG/#wrapper_montagesh","text":"Because we are using OASIS, we will need to create a wrapper script to load the ImageMagick module so that we can use it to create the montage. 
Put the following lines into wrapper_montage.sh : source /cvmfs/oasis.opensciencegrid.org/osg/modules/lmod/current/init/bash module load imagemagick montage tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.gif","title":"wrapper_montage.sh"},{"location":"Materials/10-ComplexDAG/#make-your-dag","text":"In a file called goatbrot.dag , you have your DAG specification: JOB g1 goatbrot1.sub JOB g2 goatbrot2.sub JOB g3 goatbrot3.sub JOB g4 goatbrot4.sub JOB montage montage.sub PARENT g1 g2 g3 g4 CHILD montage Ask yourself: do you know how we ensure that all the goatbrot commands can run simultaneously and all of them will complete before we run the montage job?","title":"Make your DAG"},{"location":"Materials/10-ComplexDAG/#running-the-dag","text":"Submit your DAG: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 71. -----------------------------------------------------------------------","title":"Running the DAG"},{"location":"Materials/10-ComplexDAG/#watch-your-dag","text":"Watch with condor_q: $ watch -n 10 condor_q USER -nobatch Here we see DAGMan running: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan has submitted the goatbrot jobs, but they haven't started running yet (note that the I status stands for Idle): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:10 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:00 I 0 0.0 goatbrot -i 100000 6 jobs; 0 completed, 0 removed, 4 idle, 2 running, 0 held, 0 suspended They're running! (All four jobs are in state R - running) -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:15 R 0 0.3 condor_dagman 69.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 70.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 71.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:05 R 0 0.0 goatbrot -i 100000 5 jobs; 0 completed, 0 removed, 0 idle, 5 running, 0 held, 0 suspended Two of the jobs have finished, while the others are still running (remember that completed jobs disappear from condor_q output): -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:20 R 0 0.3 condor_dagman 71.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 72.0 kagross 8/19 11:38 0+00:00:10 R 0 0.0 goatbrot -i 100000 3 jobs; 0 completed, 0 removed, 0 idle, 3 running, 0 held, 0 suspended They finished, but DAGMan hasn't noticed yet. 
It only checks periodically: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:00:30 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended DAGMan submitted and ran the montage job. It ran so fast I didn't capture it running. DAGMan will finish up soon -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 68.0 kagross 8/19 11:38 0+00:01:01 R 0 0.3 condor_dagman 1 jobs; 0 completed, 0 removed, 0 idle, 1 running, 0 held, 0 suspended Now it's all done: -- Submitter: kagross@frontal.cci.ucad.sn : <172.16.200.1:9645> : frontal.cci.ucad.sn ID OWNER SUBMITTED RUN_TIME ST PRI SIZE CMD 0 jobs; 0 completed, 0 removed, 0 idle, 0 running, 0 held, 0 suspended Examine your results. For some reason, goatbrot prints everything to stderr, not stdout. $ cat goatbrot.err.0.0 Complex image: Center: -0.75 + 0.75i Width: 1.5 Height: 1.5 Upper Left: -1.5 + 1.5i Lower Right: 0 + 0i Output image: Filename: tile_0_0.ppm Width, Height: 500, 500 Theme: beej Antialiased: no Mandelbrot: Max Iterations: 100000 Continuous: no Goatbrot: Multithreading: not supported in this build Completed: 100.0% Examine your log files ( goatbrot.log and montage.log ) and DAGMan output file ( goatbrot.dag.dagman.out ). Do they look as you expect? Can you see the progress of the DAG in the DAGMan output file? Does your final Mandlebrot image ( mandle.gif ) look correct? To view it we can use Stash. $ cp mandle.gif ~/stash/public/ And now you can go to http://stash.osgconnect.net/~USER . You will see mandle.gif listed. You can click on it to view it. Clean up your results. Be careful about deleting the goatbrot.dag. files, you do not want to delete the goatbrot.dag file, just goatbrot.dag. . $ rm goatbrot.dag.* $ rm goatbrot.out.* $ rm goatbrot.err.*","title":"Watch your DAG"},{"location":"Materials/10-ComplexDAG/#on-your-own","text":"Re-run your DAG. When jobs are running, try condor_q -dag . What does it do differently? Challenge, if you have time: Make a bigger DAG by making more tiles in the same area.","title":"On your own."},{"location":"Materials/11-HandlingFailure/","text":"Handling a DAG that fails Objective The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. 
It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 
06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because its exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the rescue DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For simplicity, DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG. If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up. Challenge If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. 
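As a hint about the shape of the solution (a sketch only -- the details are left to you): in the DAG file you can attach a POST script to the new node, and DAGMan hands the node job's exit code to that script through the $RETURN macro, for example SCRIPT POST mynode check_exit.sh $RETURN where mynode is whatever you named your extra node and check_exit.sh is a small hypothetical script that exits 0 when its first argument is the return value you consider a success, and 1 otherwise. 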
Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Handling a DAG that fails"},{"location":"Materials/11-HandlingFailure/#handling-a-dag-that-fails","text":"","title":"Handling a DAG that fails"},{"location":"Materials/11-HandlingFailure/#objective","text":"The objective of this exercise is to help you learn how DAGMan deals with job failures. DAGMan is built to help you recover from such failures. DAGMan can handle a situation where some of the nodes in a DAG fails. DAGMan will run as many nodes as possible, then create a \"rescue DAG\". A rescue DAG allows you to fix the problem and then resume your job where it left off. Recall that DAGMan decides that a jobs fails if its exit code is non-zero. Let's modify our montage job so that it fails. Work in the same directory where you did the last DAG. Edit montage.sub to add a -h to the arguments. It will look like this (the change is bolded): universe = vanilla executable = wrapper_montage.sh arguments = -h tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Submit the DAG again: $ condor_submit_dag goatbrot.dag ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 77. ----------------------------------------------------------------------- Use watch to watch the jobs until they finish. In a separate window, use tail --lines=500 -f goatbrot.dag.dagman.out to watch what DAGMan does. 06/22/12 17:57:41 Setting maximum accepts per cycle 8. 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 ** condor_scheduniv_exec.77.0 (CONDOR_DAGMAN) STARTING UP 06/22/12 17:57:41 ** /usr/bin/condor_dagman 06/22/12 17:57:41 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/22/12 17:57:41 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/22/12 17:57:41 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/22/12 17:57:41 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/22/12 17:57:41 ** PID = 26867 06/22/12 17:57:41 ** Log last touched time unavailable (No such file or directory) 06/22/12 17:57:41 ****************************************************** 06/22/12 17:57:41 Using config source: /etc/condor/condor_config 06/22/12 17:57:41 Using local config sources: 06/22/12 17:57:41 /etc/condor/config.d/00-chtc-global.conf 06/22/12 17:57:41 /etc/condor/config.d/01-chtc-submit.conf 06/22/12 17:57:41 /etc/condor/config.d/02-chtc-flocking.conf 06/22/12 17:57:41 /etc/condor/config.d/03-chtc-jobrouter.conf 06/22/12 17:57:41 /etc/condor/config.d/04-chtc-blacklist.conf 06/22/12 17:57:41 /etc/condor/config.d/99-osg-ss-group.conf 06/22/12 17:57:41 /etc/condor/config.d/99-roy-extras.conf 06/22/12 17:57:41 /etc/condor/condor_config.local ... output trimmed ... 
06/22/12 18:08:42 Event: ULOG_EXECUTE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Event: ULOG_IMAGE_SIZE for Condor Node montage (82.0.0) 06/22/12 18:08:42 Event: ULOG_JOB_TERMINATED for Condor Node montage (82.0.0) 06/22/12 18:08:42 Node montage job proc (82.0.0) failed with status 1. 06/22/12 18:08:42 Number of idle job procs: 0 06/22/12 18:08:42 Of 5 nodes total: 06/22/12 18:08:42 Done Pre Queued Post Ready Un-Ready Failed 06/22/12 18:08:42 === === === === === === === 06/22/12 18:08:42 4 0 0 0 0 0 1 06/22/12 18:08:42 0 job proc(s) currently held 06/22/12 18:08:42 Aborting DAG... 06/22/12 18:08:42 Writing Rescue DAG to goatbrot.dag.rescue001... 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/22/12 18:08:42 Note: 0 total job deferrals because of node category throttles 06/22/12 18:08:42 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/22/12 18:08:42 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/22/12 18:08:42 **** condor_scheduniv_exec.77.0 (condor_DAGMAN) pid 26867 EXITING WITH STATUS 1 DAGMan notices that one of the jobs failed because it's exit code was non-zero. DAGMan ran as much of the DAG as possible and logged enough information to continue the run when the situation is resolved. Do you see the part where it wrote the resuce DAG? Look at the rescue DAG. It's called a partial DAG: it indicates what part of the DAG has already been completed. When you re-submit the original DAG, DAGMan will notice the rescue DAG and use it in combination with the original DAG. (The rescue DAG used to be the full DAG with nodes marked as done and you would ask DAGMan to run the new rescue DAG. For your simplicity DAGMan lets you resubmit the original DAG and it reads both files.) $ cat goatbrot.dag.rescue001 # Rescue DAG file, created after running # the goatbrot.dag DAG file # Created 6/22/2012 23:08:42 UTC # Rescue DAG version: 2.0.1 (partial) # # Total number of Nodes: 5 # Nodes premarked DONE: 4 # Nodes that failed: 1 # montage, DONE g1 DONE g2 DONE g3 DONE g4 From the comment near the top, we know that the montage node failed. Let's fix it by getting rid of the offending -h argument. Change montage.sub to look like: universe = vanilla executable = wrapper_montage.sh arguments = tile_0_0.ppm tile_0_1.ppm tile_1_0.ppm tile_1_1.ppm -mode Concatenate -tile 2x2 mandle.jpg should_transfer_files = YES when_to_transfer_output = ONEXIT transfer_input_files = tile_0_0.ppm,tile_0_1.ppm,tile_1_0.ppm,tile_1_1.ppm transfer_executable = true output = montage.out error = montage.err log = goat.log queue Now we can re-submit our original DAG and DAGMan will pick up where it left off. It will automatically notice the rescue DAG If you didn't fix the problem, DAGMan would generate another rescue DAG. $ condor_submit_dag goatbrot.dag Running rescue DAG 1 ----------------------------------------------------------------------- File for submitting this DAG to Condor : goatbrot.dag.condor.sub Log of DAGMan debugging messages : goatbrot.dag.dagman.out Log of Condor library output : goatbrot.dag.lib.out Log of Condor library error messages : goatbrot.dag.lib.err Log of the life of condor_dagman itself : goatbrot.dag.dagman.log Submitting job(s). 1 job(s) submitted to cluster 83. 
----------------------------------------------------------------------- $ tail -f goatbrot.dag.dagman.out 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 ** condor_scheduniv_exec.83.0 (CONDOR_DAGMAN) STARTING UP 06/23/12 11:30:53 ** /usr/bin/condor_dagman 06/23/12 11:30:53 ** SubsystemInfo: name=DAGMAN type=DAGMAN(10) class=DAEMON(1) 06/23/12 11:30:53 ** Configuration: subsystem:DAGMAN local: class:DAEMON 06/23/12 11:30:53 ** $CondorVersion: 7.7.6 Apr 16 2012 BuildID: 34175 PRE-RELEASE-UWCS $ 06/23/12 11:30:53 ** $CondorPlatform: x86_64_rhap_5.7 $ 06/23/12 11:30:53 ** PID = 28576 06/23/12 11:30:53 ** Log last touched 6/22 18:08:42 06/23/12 11:30:53 ****************************************************** 06/23/12 11:30:53 Using config source: /etc/condor/condor_config ... Here is where DAGMAN notices that there is a rescue DAG: 06/23/12 11:30:53 Parsing 1 dagfiles 06/23/12 11:30:53 Parsing goatbrot.dag ... 06/23/12 11:30:53 Found rescue DAG number 1; running goatbrot.dag.rescue001 in combination with normal DAG file 06/23/12 11:30:53 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 06/23/12 11:30:53 USING RESCUE DAG goatbrot.dag.rescue001 06/23/12 11:30:53 Dag contains 5 total jobs Shortly thereafter it sees that four jobs have already finished: 06/23/12 11:31:05 Bootstrapping... 06/23/12 11:31:05 Number of pre-completed nodes: 4 06/23/12 11:31:05 Registering condor_event_timer... 06/23/12 11:31:06 Sleeping for one second for log file consistency 06/23/12 11:31:07 MultiLogFiles: truncating log file /home/roy/condor/goatbrot/montage.log Here is where DAGMan resubmits the montage job and waits for it to complete: 06/23/12 11:31:07 Submitting Condor Node montage job(s)... 06/23/12 11:31:07 submitting: condor_submit -a dag_node_name' '=' 'montage -a +DAGManJobId' '=' '83 -a DAGManJobId' '=' '83 -a submit_event_notes' '=' 'DAG' 'Node:' 'montage -a DAG_STATUS' '=' '0 -a FAILED_COUNT' '=' '0 -a +DAGParentNodeNames' '=' '\"g1,g2,g3,g4\" montage.sub 06/23/12 11:31:07 From submit: Submitting job(s). 06/23/12 11:31:07 From submit: 1 job(s) submitted to cluster 84. 06/23/12 11:31:07 assigned Condor ID (84.0.0) 06/23/12 11:31:07 Just submitted 1 job this cycle... 06/23/12 11:31:07 Currently monitoring 1 Condor log file(s) 06/23/12 11:31:07 Event: ULOG_SUBMIT for Condor Node montage (84.0.0) 06/23/12 11:31:07 Number of idle job procs: 1 06/23/12 11:31:07 Of 5 nodes total: 06/23/12 11:31:07 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:31:07 === === === === === === === 06/23/12 11:31:07 4 0 1 0 0 0 0 06/23/12 11:31:07 0 job proc(s) currently held 06/23/12 11:40:22 Currently monitoring 1 Condor log file(s) 06/23/12 11:40:22 Event: ULOG_EXECUTE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Event: ULOG_IMAGE_SIZE for Condor Node montage (84.0.0) 06/23/12 11:40:22 Event: ULOG_JOB_TERMINATED for Condor Node montage (84.0.0) This is where the montage finished: 06/23/12 11:40:22 Node montage job proc (84.0.0) completed successfully. 06/23/12 11:40:22 Node montage job completed 06/23/12 11:40:22 Number of idle job procs: 0 06/23/12 11:40:22 Of 5 nodes total: 06/23/12 11:40:22 Done Pre Queued Post Ready Un-Ready Failed 06/23/12 11:40:22 === === === === === === === 06/23/12 11:40:22 5 0 0 0 0 0 0 06/23/12 11:40:22 0 job proc(s) currently held And here DAGMan decides that the work is all done: 06/23/12 11:40:22 All jobs Completed! 
06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxJobs limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of -MaxIdle limit (0) 06/23/12 11:40:22 Note: 0 total job deferrals because of node category throttles 06/23/12 11:40:22 Note: 0 total PRE script deferrals because of -MaxPre limit (0) 06/23/12 11:40:22 Note: 0 total POST script deferrals because of -MaxPost limit (0) 06/23/12 11:40:22 **** condor_scheduniv_exec.83.0 (condor_DAGMAN) pid 28576 EXITING WITH STATUS 0 Success! Now go ahead and clean up.","title":"Objective"},{"location":"Materials/11-HandlingFailure/#challenge","text":"If you have time, add an extra node to the DAG. Copy our original simple program, but make it exit with a 1 instead of a 0. DAGMan would consider this a failure, but you'll tell DAGMan that it's really a success. This is reasonable--many real world programs use a variety of return codes, and you might need to help DAGMan distinguish success from failure. Write a POST script that checks the return value. Check the Condor manual to see how to describe your post script.","title":"Challenge"},{"location":"Materials/12-VariableSubstitution/","text":"Simpler DAGs with variable substitutions Objective The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can! Declare your variables First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for for a description of how to use VARS . What happens?","title":"Simpler DAGs with variable substitutions"},{"location":"Materials/12-VariableSubstitution/#simpler-dags-with-variable-substitutions","text":"","title":"Simpler DAGs with variable substitutions"},{"location":"Materials/12-VariableSubstitution/#objective","text":"The objective of this exercise is to help you write simpler DAGs by using variable substitutions in your submit files. If you look at the DAG we made, you might find it a bit tedious because each goatbrot job has a separate Condor submit file. They're nearly identical except for a couple of parameters. Can we make it simpler? Yes, we can!","title":"Objective"},{"location":"Materials/12-VariableSubstitution/#declare-your-variables","text":"First you need to declare your variables in your submit file. Make one submit file for all of your goatbrot jobs. Here's what it looks like. 
Call it goatbrot.sub : executable = /tmp/goatbrot-master/goatbrot arguments = -i 100000 -c $(CENTERX),$(CENTERY) -w 1.5 -s 500,500 -o tile_$(TILEY)_$(TILEX).ppm log = goatbrot.log output = goatbrot.out.$(TILEY).$(TILEX) error = goatbrot.err.$(TILEY).$(TILEX) should_transfer_files = YES when_to_transfer_output = ONEXIT queue Then you need to change your DAG to use VARS for variable substitution. Here's what one of the jobs would look like: JOB g1 goatbrot.sub VARS g1 CENTERX=\"-0.75\" VARS g1 CENTERY=\"0.75\" VARS g1 TILEX=\"0\" VARS g1 TILEY=\"0\" Edit your DAG similarly for all of your goatbrot jobs. If you need help, check the Condor manual for a description of how to use VARS . What happens?","title":"Declare your variables"},{"location":"Materials/13-DisciplineTutorials/","text":"Follow your Interest Exercises During this portion of the exercise you'll be able to find a tutorial that fits your interest and play with submission on OSG Connect. 
You can see the available tutorials by logging on to login.osgconnect.net and running the following command: $ tutorial list Some suggestions to get you started: The details of the tutorials will be in the README.md files with each tutorial.","title":"Follow your Interest Exercises"},{"location":"Materials/13-DisciplineTutorials/#bioinformatics-tutorials","text":"Molecule Docking $ tutorial AutoDockVina Genetic Sequence Analysis $ tutorial blast","title":"Bioinformatics Tutorials"},{"location":"Materials/13-DisciplineTutorials/#statistical-tutorials","text":"Use R to calculate Pi $ tutorial R Matlab $ tutorial matlab-HelloWorld","title":"Statistical Tutorials"},{"location":"Materials/13-DisciplineTutorials/#molecular-dynamics-tutorials","text":"NAMD Simulations $ tutorial namd GROMACS $ tutorial gromacs","title":"Molecular Dynamics Tutorials"},{"location":"Materials/13-DisciplineTutorials/#high-energy-physics-tutorials","text":"Calculate ntuples with root $ tutorial root","title":"High Energy Physics Tutorials"},{"location":"Materials/13-DisciplineTutorials/#programming-tutorials","text":"Python Virtual Environment $ tutorial python-virtualenv SWIFT Parallel Programming $ tutorial swift","title":"Programming Tutorials"},{"location":"Materials/13-DisciplineTutorials/#advanced-htc-concepts","text":"Pegasus Workflows $ tutorial pegasus Scaling on the Open Science Grid Pegasus Workflows $ tutorial scaling Feel free to explore the OSG Connect Tutorials on your own.","title":"Advanced HTC Concepts"},{"location":"Materials/14-Containers/","text":"Singularity Containers in OSG Objective Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG Default Image The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. 
For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources Exloring Images on the Submit Host Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest Custom Images OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup. Creating a Custom Image If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file. 
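As a concrete illustration of the custom-image steps just described, a Dockerfile along these lines would base the image on the OSG Ubuntu Xenial image and create the /cvmfs mount point. The base image tag, the example package, and namespace/repository_name are placeholders for this sketch, not values prescribed by the page.

    # Hypothetical Dockerfile for a custom OSG image.
    FROM opensciencegrid/osgvo-ubuntu-xenial:latest

    # Install whatever extra software your job needs (example package only).
    RUN apt-get update && apt-get install -y python3 && rm -rf /var/lib/apt/lists/*

    # Required directories: lets the container access tools and data published on /cvmfs.
    RUN mkdir -p /cvmfs

With the Dockerfile in the current directory, building and publishing are the two commands from the text:

    docker build -t namespace/repository_name .
    docker push namespace/repository_name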
Distributing Custom Images Via CVMFS In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly. Source Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Singularity Containers in OSG"},{"location":"Materials/14-Containers/#singularity-containers-in-osg","text":"","title":"Singularity Containers in OSG"},{"location":"Materials/14-Containers/#objective","text":"Singularity is a container system to allow users full control over their enviroment. You can create your own container image which your job will execute within, or choose from a set of pre-defined images. For more information about Singularity, please see: Singularity Home Page The following talk describes Singularity for scientific computing: Singularity Talk Derek Weitzel wrote a blog post about Singularity on OSG, which provides a good introduction on how to create images and run them, but does not cover all the functionality described further down: Singularity on the OSG","title":"Objective"},{"location":"Materials/14-Containers/#default-image","text":"The default setup is to auto load an image on sites which support Singularity. Every job which lands on such a site, will have a container started just for that job, and then run within that container. Most users will not even know that their jobs are run within a container, but it will provide them with a consistent environment across OSG sites. The current default container is based on EL6 and contains a basic set of tools expected from OSG compute nodes. The image is loaded from /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6 and the definition file is available in GitHub https://github.com/opensciencegrid/osgvo-el6 . If you want to steer a job to run on a default Singularity instance, use HAS_SINGULARITY == True in the job requirements. For example: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue To instruct the system to load a different image, use the +SingularityImage attribute in your job submit file. For example, to run your job under EL7: universe = vanilla executable = job.sh Requirements = HAS_SINGULARITY == TRUE +SingularityImage = \"/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7\" +SingularityBindCVMFS = True should_transfer_files = IF_NEEDED when_to_transfer_output = ON_EXIT output = out error = err log = log queue The user support team maintains a set of images. These contain a basic set of tools and libraries. The images are are: Image Location Defintion Description EL 6 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest GitHub A basic Enterprise Linux (CentOS) 6 based image. 
This is currently our default image EL 7 /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest GitHub A basic Enterprise Linux (CentOS) 7 based image. Ubuntu Xenial /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest GitHub A good image if you prefer Ubuntu over EL flavors TensorFlow /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow:latest GitHub Base on the TensorFlow base image, with a few OSG package added TensorFlow GPU /cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest GitHub Used for running TensorFlow jobs on OSG GPU resources","title":"Default Image"},{"location":"Materials/14-Containers/#exloring-images-on-the-submit-host","text":"Images can be explored interactively on the submit hosts by starting it in \"shell\" mode. The recommended command line, similar to how containers are started for jobs, is: singularity shell \\ --home $PWD:/srv \\ --pwd /srv \\ --bind /cvmfs \\ --scratch /var/tmp \\ --scratch /tmp \\ --contain --ipc --pid \\ /cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-ubuntu-xenial:latest","title":"Exloring Images on the Submit Host"},{"location":"Materials/14-Containers/#custom-images","text":"OSG Connect provides tooling for users to create, publish and load custom images. This is useful if your job requires some very specific software setup.","title":"Custom Images"},{"location":"Materials/14-Containers/#creating-a-custom-image","text":"If you want to use an image you have created yourself, the image should be defined as a Docker image and published in the Docker Hub . The reason we use Docker as a source image repository is that it allows us to easily import the images into our own distribution system (see below). To get started, create a Docker user, sign in to the hub, and create a new repository. You will end up with an identifier of the namespace/repository_name format. Create an image locally using a Dockerfile and the docker build . We suggest you base the image on one of the provided OSG images. For example, if you want to base the image on our Ubuntu Xenial image, first download the Dockerfile from the GitHub repository . Edit the Dockerfile to fit your requirements. Then build the image with tag matching your Docker Hub repository: docker build -t namespace/repository_name . Once you have a successful build, push it to the hub: docker push namespace/repository_name Then register the image as described in the next section. If you prefer, you can base you image on images not already published by OSG, but if you do this, we recommend that you as one of the steps create the /cvmfs directory. This will enable the container to access tools and data published on /cvmfs. In your Dockerfile , add: # required directories RUN mkdir -p /cvmfs See one of the provided image defintions for a full example. If you do not want /cvmfs mounted in the container, please add +SingularityBindCVMFS = False to your job submit file.","title":"Creating a Custom Image"},{"location":"Materials/14-Containers/#distributing-custom-images-via-cvmfs","text":"In order to be able to efficiently distribute the container images to a large of distributed compute hosts, OSG has choosen to host the images under CVMFS . Any image publically available in Docker can be included for automatic syncing into the CVMFS repository. 
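Jumping one step ahead of the registration process described here: once a custom image has been synced, a job selects it with the same +SingularityImage mechanism shown earlier, pointing at the unpacked copy under CVMFS. The namespace/repository_name below is the same placeholder used in the custom-image section, not a real repository.

    # Hypothetical submit file fragment using a custom image synced to CVMFS.
    universe = vanilla
    executable = job.sh
    Requirements = HAS_SINGULARITY == TRUE
    +SingularityImage = "/cvmfs/singularity.opensciencegrid.org/namespace/repository_name:latest"
    +SingularityBindCVMFS = True
    should_transfer_files = IF_NEEDED
    when_to_transfer_output = ON_EXIT
    output = out
    error = err
    log = log
    queue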
The result is an unpacked image under /cvmfs/singularity.opensciencegrid.org/ To get your images included, please either create a git pull request against docker_images.txt in the cvmfs-singularity-sync repository , or contact user-support@opensciencegrid.org and we can help you. Once your image has been registered, new versions pushed to Docker Hub will automatically be detected and CVMFS will be updated accordingly.","title":"Distributing Custom Images Via CVMFS"},{"location":"Materials/14-Containers/#source","text":"Paged sourced from https://support.opensciencegrid.org/support/solutions/articles/12000024676-singularity-containers.","title":"Source"},{"location":"Materials/DSP_DT2017/","text":"Data Sao Paulo 2017 High-Level Curriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers Friday AM Session UNESP Presentation and Tour Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:00 Exercise - Accessing the Open Science Grid and DAGMan Rob Quick 15:00 Coffee Break 15:30 A Brief Introduction to Clouds and Containers Rob Quick 16:00 Exercise - Using Containers on OSG and Discpline Specfic Tutorials Rob Quick Friday 21-July 2017 Time Discription Instructor 09:30 Introduction and Tour of Grid UNESP Facilities Raphael C\u00f3be 11:00 Coffee Break 11:30 Wrap of Computational Infrastructures Rob Quick 12:00 Closing Thoughts Rob Quick 12:30 Adjourn Materials Materials Page","title":"Data Sao Paulo Schedule"},{"location":"Materials/DSP_DT2017/#data-sao-paulo-2017","text":"","title":"Data Sao Paulo 2017"},{"location":"Materials/DSP_DT2017/#high-level-curriculum-overview","text":"","title":"High-Level Curriculum Overview"},{"location":"Materials/DSP_DT2017/#thursday-am-session","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"Materials/DSP_DT2017/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Brief Introduction to clouds and containers","title":"Thursday PM Session"},{"location":"Materials/DSP_DT2017/#friday-am-session","text":"UNESP Presentation and Tour Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"Materials/DSP_DT2017/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"Materials/DSP_DT2017/#thursday-20-july-2017","text":"Time Discription Instructor 09:30 Welcome and the Landscape of Research Computing Rob Quick 10:00 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 11:00 Coffee Break 11:30 Profiling your application and finding a home for your workflow Rob Quick 12:00 Exercise - Single and batch submittion with HTCondor Rob 
Quick 12:30 Lunch 13:30 Worflows and distributed environments Rob Quick 14:00 Exercise - Accessing the Open Science Grid and DAGMan Rob Quick 15:00 Coffee Break 15:30 A Brief Introduction to Clouds and Containers Rob Quick 16:00 Exercise - Using Containers on OSG and Discpline Specfic Tutorials Rob Quick","title":"Thursday 20-July 2017"},{"location":"Materials/DSP_DT2017/#friday-21-july-2017","text":"Time Discription Instructor 09:30 Introduction and Tour of Grid UNESP Facilities Raphael C\u00f3be 11:00 Coffee Break 11:30 Wrap of Computational Infrastructures Rob Quick 12:00 Closing Thoughts Rob Quick 12:30 Adjourn","title":"Friday 21-July 2017"},{"location":"Materials/DSP_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"Materials/DSP_Materials/","text":"Data Sao Paulo School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Thursday Morning - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Thursday Morning - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday Afternoon - Computational Infrastructures - Session 3 Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG Challenge Exercises on DAGs Handling jobs that fail Variable Substitution Thursday Aftenoon - Computational Infrastructures - Session 4 Lecture 4 - Slides Discipline specific tutorials Containers with Singularity Friday Morning - Computational Infrastructures - Session 5 Grid UNESP Presentation Friday Morning - Computational Infrastructures - Session 6 Computational Infrastructures Wrap Up - Slides Contact information Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Data Sao Paulo Materials"},{"location":"Materials/DSP_Materials/#data-sao-paulo-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Data Sao Paulo School Materials"},{"location":"Materials/DSP_Materials/#thursday-morning-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Thursday Morning - Computational Infrastructures - Session 1"},{"location":"Materials/DSP_Materials/#thursday-morning-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Thursday Morning - Computational Infrastructures - Session 2"},{"location":"Materials/DSP_Materials/#thursday-afternoon-computational-infrastructures-session-3","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set Coordinating sets of jobs: A simple DAG A more complex DAG","title":"Thursday Afternoon - Computational Infrastructures - Session 3"},{"location":"Materials/DSP_Materials/#challenge-exercises-on-dags","text":"Handling jobs that fail Variable Substitution","title":"Challenge Exercises on DAGs"},{"location":"Materials/DSP_Materials/#thursday-aftenoon-computational-infrastructures-session-4","text":"Lecture 4 - Slides Discipline specific tutorials Containers with Singularity","title":"Thursday Aftenoon - Computational Infrastructures - Session 4"},{"location":"Materials/DSP_Materials/#friday-morning-computational-infrastructures-session-5","text":"Grid UNESP Presentation","title":"Friday Morning - Computational Infrastructures - Session 5"},{"location":"Materials/DSP_Materials/#friday-morning-computational-infrastructures-session-6","text":"Computational Infrastructures Wrap Up - Slides","title":"Friday Morning - Computational Infrastructures - Session 6"},{"location":"Materials/DSP_Materials/#contact-information","text":"Rob Quick - rquick@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. 
Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"Materials/PH_DT2017/","text":"Polar Hackathon 2017 High-Level Cirriculum Overview Wednesday Session 1 Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Wednesday Session 2 Workflows and distributed environments Thursday Sessions Containers Jetstream demo and discussion Materials Materials Page","title":"Polar Hackathon Schedule"},{"location":"Materials/PH_DT2017/#polar-hackathon-2017","text":"","title":"Polar Hackathon 2017"},{"location":"Materials/PH_DT2017/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"Materials/PH_DT2017/#wednesday-session-1","text":"Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Wednesday Session 1"},{"location":"Materials/PH_DT2017/#wednesday-session-2","text":"Workflows and distributed environments","title":"Wednesday Session 2"},{"location":"Materials/PH_DT2017/#thursday-sessions","text":"Containers Jetstream demo and discussion","title":"Thursday Sessions"},{"location":"Materials/PH_DT2017/#materials","text":"Materials Page","title":"Materials"},{"location":"Materials/PH_Materials/","text":"Polar Hackathon School Materials We will be using OSG Connect for this set of sessions. Please visit http://www.osgconnect.net for more information. Wednesday - Computational Infrastructures - Session 1 Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks Wednesday - Computational Infrastructures - Session 2 Lecture 2 - Slides A scripting job An R Job Working with file I/O Thursday - Computational Infrastructures - Session 3 Lecture 4 - Slides Containers with Singularity Jetstream Demo Possible if Time - Workflows Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow Extras Discipline specific tutorials Contact information Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Polar Hackathon Materials"},{"location":"Materials/PH_Materials/#polar-hackathon-school-materials","text":"We will be using OSG Connect for this set of sessions. 
Please visit http://www.osgconnect.net for more information.","title":"Polar Hackathon School Materials"},{"location":"Materials/PH_Materials/#wednesday-computational-infrastructures-session-1","text":"Welcome and Introduction - The Landscape of Academic Research Computing Lecture 1 - Slides Introduction Our Job Manager (HTCondor) Your First Managed Job A few tips and tricks","title":"Wednesday - Computational Infrastructures - Session 1"},{"location":"Materials/PH_Materials/#wednesday-computational-infrastructures-session-2","text":"Lecture 2 - Slides A scripting job An R Job Working with file I/O","title":"Wednesday - Computational Infrastructures - Session 2"},{"location":"Materials/PH_Materials/#thursday-computational-infrastructures-session-3","text":"Lecture 4 - Slides Containers with Singularity Jetstream Demo","title":"Thursday - Computational Infrastructures - Session 3"},{"location":"Materials/PH_Materials/#possible-if-time-workflows","text":"Lecture 3 - Slides A brief detour through the Mandlebrot set A more complex workflow","title":"Possible if Time - Workflows"},{"location":"Materials/PH_Materials/#extras","text":"Discipline specific tutorials","title":"Extras"},{"location":"Materials/PH_Materials/#contact-information","text":"Kyle Gross \u2013 kagross@iu.edu http://www.osgconnect.net/ DOSAR: Distributed Organization for Scientific and Academic Research http://www.dosar.org/ You are welcome to join our bi\u2010weekly video (Vidyo) meetings. Send request to be added to DOSAR email list to Prof. Greenwood: greenw@phys.latech.edu reference you attended the Polar Hackathon If you want long\u2010term grid access, you can go to http://www.osgconnect.net and sign up","title":"Contact information"},{"location":"Materials/School/","text":"Data Trieste 2017 High-Level Cirriculum Overview Thursday AM Session Welcome and Background The Landscape of Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC) Thursday PM Session Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs) Friday AM Session Introduction to cloud environemts Close out and resources for further collaboration Detailed Schedule Thursday 20-July 2017 Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick Friday 21-July 2017 Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch Materials Materials Page","title":"Data Trieste 2017"},{"location":"Materials/School/#data-trieste-2017","text":"","title":"Data Trieste 2017"},{"location":"Materials/School/#high-level-cirriculum-overview","text":"","title":"High-Level Cirriculum Overview"},{"location":"Materials/School/#thursday-am-session","text":"Welcome and Background The Landscape of 
Research Computing Profiling your application and choosing appropriate computing resources Introduction to High Throughput Computing (HTC)","title":"Thursday AM Session"},{"location":"Materials/School/#thursday-pm-session","text":"Workflows and distributed environments Workflows with Directed Acyclic Graphs (DAGs)","title":"Thursday PM Session"},{"location":"Materials/School/#friday-am-session","text":"Introduction to cloud environemts Close out and resources for further collaboration","title":"Friday AM Session"},{"location":"Materials/School/#detailed-schedule","text":"","title":"Detailed Schedule"},{"location":"Materials/School/#thursday-20-july-2017","text":"Time Discription Instructor 09:00 Welcome and the Landscape of Research Computing Rob Quick 09:30 Exercise - UNIX Refresher, Running Locally, Batch Services Rob Quick 10:30 Coffee Break 11:00 Profiling your application and finding a home for your workflow Rob Quick 11:45 Exercise - Single and batch submittion with HTCondor Rob Quick 13:00 Lunch 14:00 Worflows and distributed environments Rob Quick 14:30 Exercise - Accessing Distributed Resources on the Open Science Grid Rob Quick 15:15 Coffee Break 15:45 Workflows with Directed Acyclic Graphs (DAGs) Rob Quick 16:15 Exercise - DAGMAN Rob Quick","title":"Thursday 20-July 2017"},{"location":"Materials/School/#friday-21-july-2017","text":"Time Discription Instructor 09:00 Introduction to cloud environments Gergely Sipos 09:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 10:30 Coffee Break 11:00 Cloud Compting Resources Gergely Sipos 11:45 Exercise - Using the EGI Fed Cloud Gergely Sipos 13:00 Closing Thoughts Rob Quick 13:30 Lunch","title":"Friday 21-July 2017"},{"location":"Materials/School/#materials","text":"Materials Page","title":"Materials"},{"location":"Meetings/12March2019/","text":"DOSAR Group Meeting Tuesday, March 12 2019 10:30 ET, 9:30 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies: Upcoming Events ASP2020 Site Planning Visit - Marrakech/Rabat, Morocco - April 1 to 5 - 2019 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019 New Initiatives","title":"March 12, 2019"},{"location":"Meetings/12March2019/#dosar-group-meeting","text":"Tuesday, March 12 2019 10:30 ET, 9:30 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies:","title":"DOSAR Group Meeting"},{"location":"Meetings/12March2019/#upcoming-events","text":"ASP2020 Site Planning Visit - Marrakech/Rabat, Morocco - April 1 to 5 - 2019 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019","title":"Upcoming Events"},{"location":"Meetings/12March2019/#new-initiatives","text":"","title":"New Initiatives"},{"location":"Meetings/24October2018/","text":"DOSAR Group Meeting Wednesday, October 24 12:00 ET, 11:00 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies: Upcoming Events CODATA/RDA School of Research Data Science - Kigali, Rwanda - October 22 to November 2 -2018 CODATA/RDA School of Research Data Science - Sao Paulo, Brazil - December 3 to December 14 - 2018 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019 New Initiatives","title":"October 24, 2018"},{"location":"Meetings/24October2018/#dosar-group-meeting","text":"Wednesday, October 24 12:00 ET, 11:00 CT Meeting Coordinates: https://iu.zoom.us/my/rquick Attendees: Apologies:","title":"DOSAR Group Meeting"},{"location":"Meetings/24October2018/#upcoming-events","text":"CODATA/RDA 
School of Research Data Science - Kigali, Rwanda - October 22 to November 2 -2018 CODATA/RDA School of Research Data Science - Sao Paulo, Brazil - December 3 to December 14 - 2018 CODATA/RDA School of Research Data Science - Trieste, Italy - August 5 to August 16 - 2019","title":"Upcoming Events"},{"location":"Meetings/24October2018/#new-initiatives","text":"","title":"New Initiatives"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index e58e553..35294ec 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1,87 +1,87 @@ https://osg-htc.org/dosar/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/ASP2024/ASP2024_Schedule/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/ASP2024/ASP2024_Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/ASP2022/ASP2022_Schedule/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/ASP2022/ASP2022_Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataTrieste2019/School/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataTrieste2019/Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataSaoPaulo2018/School/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataSaoPaulo2018/Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataKigali2018/School/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataKigali2018/Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataTrieste2018/School/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/DataTrieste2018/Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/ASP2018/ASP2018_Schedule/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/ASP2018/ASP2018_Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/Materials/DSP_DT2017/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/Materials/DSP_Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/Materials/PH_DT2017/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/Materials/PH_Materials/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/Meetings/24October2018/ - 2024-07-07 + 2024-07-08 daily https://osg-htc.org/dosar/Meetings/12March2019/ - 2024-07-07 + 2024-07-08 daily \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 91286516fa2317646fa08415c5023fc2d810b59d..a47881828069626ac81f1dd907e70fd27c837b23 100644 GIT binary patch literal 365 zcmV-z0h0b7iwFoHR*Yr>|8r?{Wo=<_E_iKh0L_v*1KafH3HjqclZ z{IY*Yig+Vu<;fXBABEzc)^X?k@lJ^I`J9sWcz`3N*0q9E7j8&QNs*Fl2(s?yDHkQDNQVi>tQXqe@jm)!rkz~sxE5h=Wn@Us%eT(l7KV$>ie3#-T zGuD)1*!=GV_jNVwcZsicc%bHFkRjR*JUR`A*s$f7$WPl9jgz)5c-T9QO16bv$`V8{e~mX^QMV1vhA%v|U-Y zb@jU42f!seES|!uM~=P%6)RncJ{;*usGnpF-SZ zhMGe3lmDOKu`2udE^%Ll7phMN8KP~$qtjrB4O@PR{Ip%sIBDB}hn>?%9`iI?ilEI} z!w`&{vFWZbm9=mw1DC+&ud@A6!5U7oz84d2I*inWL{j#Yz{V4_kT?`RrAB`;Kl{3XH(Ad<#ys!+zslSd$@ygN*zI LY9y`m_zVC5nE<$`