% Start a job on the OMNI cluster and retrieve the results afterwards.
%
% How to use this script:
% 1. Copy this script somewhere on your Matlab path (this includes your
% current folder).
% 2. Set the options at the top of this script.
% 3. Go to the folder that contains the m-file that you intend to
% run on the cluster. All other necessary files need to be in this
% folder, it will be copied to the cluster as a whole.
% 4. Run this script, see below for possible arguments.
% 5. When your job is complete, go to the same local directory and use the
% 'retrieve' option to get the folder back. The retrieved folder will
% be copied to '..', i.e. it will appear next to the original folder.
%
% Possible arguments:
% * Running with no arguments is equivalent to: "generate"+"copy"+"queue"
% * "copy", "cp", "c": Copy the folder to the cluster
% * "generate", "gen", "g": Generate job script
% * "queue", "q": Queue job with sbatch
% * "retrieve", "ret", "r": Retrieve the folder from the cluster
% * "simulate", "sim", "s": Do not create a job script or log into the
% cluster, only print
% what the output would be. This can be used together
% with any of the other options for testing purposes.
%
% You can also use multiple commands, separated by spaces. Examples:
% >> omni sim retrieve
% >> omni cp q
%
% Notes:
%
% * This script benefits greatly from a password-less login.
% To set up a password-less login, see the OMNI cluster website.
% If you have not set up a password-less login, you will need to enter your
% password at some points.
%
% * For Windows users: you need to have SSH installed and password-less
% login is NOT optional. SSH is included in Windows 10 version 1809 or
% newer. You can test if you have ssh installed by entering 'ssh' in the
% Windows command console (launch the console by using the Windows key
% and then typing 'cmd'). If you have no 'ssh' command, there are other
% options which the HPC support team would love to test out with you,
% please contact us!
%
% * This is a beta version! Please report any problems and suggestions to
% hpc-support@uni-siegen.de.
function omni(varargin)
% OMNI  Generate, copy, queue and/or retrieve a Matlab job on the cluster.
%   omni(ACTION, ...) performs the actions selected by the keyword
%   arguments (parsed by setActions). All job settings are taken from the
%   variables in the settings sections below.
clc
%% User-specific settings
% Your username
user = 'js056352';
% Your e-mail address
email = 'jan.steiner@uni-siegen.de';
%% Required job settings
% The Matlab script that you want to run (without '.m' at the end).
mFile = 'jobtest1';
% SLURM settings for your job. Use 'spartition' on the cluster to see queues.
% Caution: job will be stopped when maximum runtime is reached!
queue = 'short';
walltime = '0-00:10:00';
% Size of the parallel pool if needed (0=not needed, max. 16)
poolSize = 0;
% Cluster base directory (e.g. workspace). Has to be an absolute path.
% The directory containing the m-file will be copied into this as a
% subdirectory whose name will be identical to the job name.
clusterBaseDir = '/home/js056352';
%% Optional job settings
% If you don't specifically need these, you can leave them alone.
% Give the job a name (default: <mFile>_<timestamp>). Will be set
% later if left empty.
jobName = '';
% Job script name (default: run_<mFile>.slurm). Will be set later if
% left empty.
jobScriptName = '';
% Number of threads for Matlab to use (0 = automatic). By default this will be:
% * for regular Matlab jobs: all CPUs on the node
% * for parallel pool jobs: 1 (per worker)
nThreads = 0;
% Additional pre- and postprocessing steps, these will be added to the job script
% before and after the call to Matlab
preSteps = '';
postSteps = '';
% Whether you want to receive an email notification (default: 'END')
% For possible options, type 'man sbatch' on the cluster and look for
% 'mail-type'
mailType = 'END';
% Additional SLURM command line parameters. See 'sbatch' documentation for
% possible options.
sbatchOptions = '';
% Cluster address
clusterAddress = 'omni.zimt.uni-siegen.de';
% Log file names (will use SLURM defaults if left empty)
slurmStdoutLog = '';
slurmStderrLog = '';
% Name of mat file that will contain necessary info to copy data back.
retrievalInfoFileName = 'retrievalinfo.mat';
%% End of settings
% You should not modify the script below this line unless you
% absolutely know what you are doing!

%% Handle input arguments
if ~exist(['./' mFile '.m'],'file')
    error( ['No mFile named ', mFile, '.m found!'] )
end
actions = setActions(nargin,varargin);
localDir = fileparts( which(mFile) );
% Windows backslashes are doubled. This representation is also what gets
% stored in the retrieval info file, so it is kept unchanged here.
if ispc
    localDir = strrep(localDir,'\','\\');
end
timestamp = datestr(now, 'yyyy-mm-dd_HH-MM-SS');

%% Set options according to input arguments
% Set defaults
if strcmp(jobName,'')
    jobName = [mFile,'_',timestamp];
end
if strcmp(jobScriptName,'')
    jobScriptName = ['run_',mFile,'.slurm'];
end
clusterDir = [clusterBaseDir,'/',jobName];

% Upper limit for the number of threads per task
maxThreads = 64;

% Multithreading and parallel processing
if nThreads == 0
    % Automatic thread count is not available on the SMP queue
    if strcmp(queue,'smp')
        error( 'The number of threads has to be set manually when using the SMP queue' );
    end
    if poolSize <= 1
        % Regular Matlab job: one task using all threads
        nTasks = 1;
        nThreads = maxThreads;
    else
        % Parallel pool job: one single-threaded task per worker
        nTasks = poolSize;
        nThreads = 1;
    end
else
    % Thread count set manually. A job needs at least one task, so a
    % pool size of 0 must not be turned into '#SBATCH -n 0'.
    nTasks = max(poolSize, 1);
end

% Checks
if nThreads > maxThreads
    error( ['Too many threads for nodes in ' queue ' partition!'] )
end
if nTasks > 16
    error('More than 16 Matlab workers are not possible on the OMNI cluster!')
end

disp('OMNI job utility script.')
disp('Actions performed:')
for currAct = keys(actions)
    if actions(currAct{1})
        disp([ ' -> ' currAct{1} ])
    end
end

%% Generate job script
if actions('generate')
    % File names and job content are always passed as data ('%s'), never
    % as the format string, so that '%' or '\' in them is printed verbatim.
    fprintf('\nGenerating job script, filename: %s\n', jobScriptName)

    % Appends one line plus a newline to the script text
    addLine = @(content, newLine) sprintf('%s%s\n', content, newLine);

    % Compose job script content
    batchContent = '';
    batchContent = addLine(batchContent, '#!/bin/bash');
    batchContent = addLine(batchContent, ['#SBATCH -J ' jobName ]);
    batchContent = addLine(batchContent, ['#SBATCH -n ' num2str(nTasks) ]);
    batchContent = addLine(batchContent, ['#SBATCH --partition=' queue]);
    batchContent = addLine(batchContent, ['#SBATCH --time=' walltime]);
    batchContent = addLine(batchContent, ['#SBATCH --cpus-per-task=' num2str(nThreads) ]);
    batchContent = addLine(batchContent, ['#SBATCH --mail-user=' email]);
    batchContent = addLine(batchContent, ['#SBATCH --mail-type=' mailType]);
    if ~strcmp(slurmStdoutLog,'')
        batchContent = addLine(batchContent, ['#SBATCH --output=' slurmStdoutLog]);
    end
    if ~strcmp(slurmStderrLog,'')
        batchContent = addLine(batchContent, ['#SBATCH --error=' slurmStderrLog]);
    end
    if ~strcmp(sbatchOptions,'')
        batchContent = addLine(batchContent, '# Additional command line options used when queueing this script:');
        batchContent = addLine(batchContent, ['# ' sbatchOptions]);
    end
    batchContent = addLine(batchContent, '');
    batchContent = addLine(batchContent, '# Begin of job commands');
    batchContent = addLine(batchContent, 'module load matlab');
    batchContent = addLine(batchContent, ['cd ' clusterDir '/']);
    batchContent = addLine(batchContent, '');
    if ~strcmp( preSteps, '' )
        batchContent = addLine(batchContent, '# Preprocessing steps');
        batchContent = addLine(batchContent, preSteps);
        batchContent = addLine(batchContent, '');
    end
    batchContent = addLine(batchContent, '# Primary Matlab call');
    batchContent = addLine(batchContent, ['matlab -nodisplay -r "' mFile '; quit"']);
    if ~strcmp( postSteps, '' )
        batchContent = addLine(batchContent, '');
        batchContent = addLine(batchContent, '# Postprocessing steps');
        batchContent = addLine(batchContent, postSteps);
    end

    if ~actions('simulate')
        % Write the job script, failing with a clear message if the file
        % cannot be opened.
        [jobScriptFile, openMsg] = fopen(jobScriptName, 'w');
        if jobScriptFile == -1
            error('Unable to open job script file %s for writing: %s', ...
                jobScriptName, openMsg);
        end
        fprintf(jobScriptFile, '%s', batchContent);
        fclose(jobScriptFile);
        % Wait for the file to be written
        while exist(jobScriptName,'file') ~= 2
            pause(0.5)
        end
    else
        fprintf(' No Job script written due to simulation mode. Jobscript contents:\n')
        fprintf('-------- Begin of job script --------\n')
        fprintf('%s', batchContent)
        fprintf('-------- End of job script --------\n')
    end
end

%% Copy all necessary files to the cluster
if actions('copy')
    fprintf('\nCopying files to cluster. You may need to enter your password if prompted:\n')
    % The local path is quoted so that directories containing spaces work
    copycmd = ['scp -r "' localDir '" ' user '@' clusterAddress ':' clusterDir];
    if ~actions('simulate')
        cmdStatus = system(copycmd,'-echo');
        if cmdStatus ~= 0
            error('Unable to copy files to cluster, aborting!');
        end
        % Save info necessary to copy all results from the cluster back here
        fprintf('\nSaving retrieval info to: %s\n\n', retrievalInfoFileName)
        save( retrievalInfoFileName, 'user', 'clusterAddress', 'localDir', 'clusterDir' );
    else
        fprintf(' No SCP command executed due to simulation mode. Command string:\n');
        fprintf(' %s\n', copycmd);
        % Nothing was copied, so the retrieval info of a previous real
        % job must not be overwritten.
        fprintf('\n No retrieval info saved due to simulation mode.\n\n');
    end
end

%% Login and queue the script
if actions('queue')
    fprintf('Queuing job on cluster.\n')
    if ~actions('copy')
        % The folder was copied in an earlier run: use the target that
        % was recorded then, not the freshly timestamped one.
        if ~exist(retrievalInfoFileName, 'file')
            error('Retrieval info file %s not found. Copy the folder to the cluster before queueing.', ...
                retrievalInfoFileName);
        end
        savedInfo = load( retrievalInfoFileName, 'user', 'clusterAddress', 'clusterDir' );
        user = savedInfo.user;
        clusterAddress = savedInfo.clusterAddress;
        clusterDir = savedInfo.clusterDir;
    end
    sbatchcmd = ['sbatch ' sbatchOptions ' ' jobScriptName];
    jobqueuecmd = ['ssh ' user '@' clusterAddress ' "cd ' clusterDir '; ' sbatchcmd '"'];
    if ~actions('simulate')
        cmdStatus = system(jobqueuecmd, '-echo');
        if cmdStatus ~= 0
            error('Unable to queue job on cluster, aborting!')
        end
    else
        fprintf(' No SSH command executed due to simulation mode. Command string:\n');
        fprintf(' %s\n', jobqueuecmd);
    end
end

%% Retrieve results from cluster
if actions('retrieve')
    retrieve( retrievalInfoFileName, actions('simulate') )
end

%% Housekeeping
fprintf('\nScript successfully completed.\n')
end
function actions = setActions(nArgs, args)
% SETACTIONS  Translate command keywords into a map of action flags.
%   actions = setActions(nArgs, args)
%
%   nArgs   - number of arguments the caller received
%   args    - cell array of keyword arguments (a single char vector is
%             also accepted)
%   actions - containers.Map (char -> logical) with the keys 'simulate',
%             'generate', 'copy', 'queue' and 'retrieve'
%
%   With no arguments, or with a simulate keyword as the only argument,
%   'generate', 'copy' and 'queue' are switched on. Arguments that match
%   no keyword are ignored, but a warning is issued for each of them so
%   that a typo does not silently turn the run into a no-op.

%% Initialize
if ischar(args)
    args = {args};
end
actions = containers.Map('KeyType','char','ValueType','logical');

% Accepted spellings for every action
aliases = struct( ...
    'simulate', {{'simulate','sim','s'}}, ...
    'generate', {{'generate','gen','g'}}, ...
    'copy',     {{'copy','cp','c'}}, ...
    'queue',    {{'queue','q'}}, ...
    'retrieve', {{'retrieve','ret','r'}} );
actionNames = fieldnames(aliases);

%% Set
recognized = false(1, numel(args));
for iAction = 1:numel(actionNames)
    hits = false(1, numel(args));
    for alias = aliases.(actionNames{iAction})
        hits = hits | reshape( strcmp(args, alias{1}), 1, [] );
    end
    actions(actionNames{iAction}) = any(hits);
    recognized = recognized | hits;
end

% Report arguments that did not match any keyword
for iArg = find(~recognized)
    if ischar(args{iArg})
        warning('omni:unknownArgument', ...
            'Ignoring unrecognized argument "%s".', args{iArg});
    else
        warning('omni:unknownArgument', ...
            'Ignoring argument number %d, which is not a character vector.', iArg);
    end
end

% Default action set
if ( nArgs == 1 && actions('simulate') ) || nArgs == 0
    actions('generate') = true;
    actions('copy') = true;
    actions('queue') = true;
end

%% Check
if actions('retrieve') && ( actions('queue') || actions('generate') || actions('copy') )
    warning('Attempting to retrieve a folder immediately after creation. Results will be incomplete or nonexistent.')
end
end
function retrieve(retInfoFileName,simMode)
% RETRIEVE  Copy a job folder from the cluster back to the local computer.
%   retrieve(retInfoFileName, simMode)
%
%   retInfoFileName - mat file holding the variables 'user',
%                     'clusterAddress', 'clusterDir' and 'localDir'
%   simMode         - if true, the scp command is only printed, not run
%
%   The remote folder is copied with 'scp -r' into the parent of localDir.
%   Raises an error if the info file is missing or scp returns a non-zero
%   status.

%% Load the necessary information
if ( ~exist(retInfoFileName, 'file' ) )
    error('Unable to find retrieval info file!')
end
% Load into a struct instead of creating variables implicitly
info = load( retInfoFileName, 'user', 'clusterAddress', 'clusterDir', 'localDir' );

%% Copy the folder to the local PC
% The local destination is quoted so that paths containing spaces work
copycmd = ['scp -r ' info.user '@' info.clusterAddress ':' info.clusterDir ...
    ' "' info.localDir '/.."'];
fprintf('Retrieving result data from cluster.\n')
fprintf('You may need to enter your password if prompted.\n')
if ~simMode
    copyStatus = system(copycmd,'-echo');
    if copyStatus ~= 0
        % Format arguments are required here: the status is numeric, and
        % error() only interprets '\n' when given more than one argument.
        error('Could not copy directory %s to local computer!\nSCP command reports status %d', ...
            info.clusterDir, copyStatus)
    end
else
    fprintf( ' No SCP command executed due to simulation mode. Command syntax:\n')
    % Print the command as data so that '\' and '%' are shown verbatim
    fprintf(' %s\n', copycmd)
end
fprintf('Script completed.\n')
end