% Start a job on the OMNI cluster and retrieve the results afterwards.
%
% How to use this script:
% 1. Copy this script somewhere on your Matlab path (this includes your
%    current folder).
% 2. Set the options at the top of this script.
% 3. Go to the folder that contains the m-file that you intend to
%    run on the cluster. All other necessary files need to be in this
%    folder, it will be copied to the cluster as a whole.
% 4. Run this script, see below for possible arguments.
% 5. When your job is complete, go to the same local directory and use the
%    'retrieve' option to get the folder back. The retrieved folder will
%    be copied to '..', i.e. it will appear next to the original folder.
%
% Possible arguments:
% * Running with no arguments is equivalent to: "generate"+"copy"+"queue"
% * "copy", "cp", "c": Copy the folder to the cluster
% * "generate", "gen", "g": Generate job script
% * "queue", "q": Queue job with sbatch
% * "retrieve", "ret", "r": Retrieve the folder from the cluster
% * "simulate", "sim", "s": Do not create a job script or log into the
%   cluster, only print what the output would be. This can be used together
%   with any of the other options for testing purposes.
%
% You can also use multiple commands, separated by spaces. Examples:
% >> omni sim retrieve
% >> omni cp q
%
% Notes:
%
% * This script benefits greatly from a password-less login.
%   To set up a password-less login, see the OMNI cluster website.
%   If you have not set up a password-less login, you will need to enter your
%   password at some points.
%
% * For Windows users: you need to have SSH installed and password-less
%   login is NOT optional. SSH is included in Windows 10 version 1809 or
%   newer. You can test if you have ssh installed by entering 'ssh' in the
%   Windows command console (launch the console by using the Windows key
%   and then typing 'cmd').
%   If you have no 'ssh' command, there are other
%   options which the HPC support team would love to test out with you,
%   please contact us!
%
% * This is a beta version! Please report any problems and suggestions to
%   hpc-support@uni-siegen.de.

function omni(varargin)
% OMNI  Generate, copy, queue and/or retrieve a Matlab job on the OMNI cluster.
%
%   omni                 generate the job script, copy the folder, queue the job
%   omni <action> ...    perform only the listed actions; recognized actions are
%                        generate/gen/g, copy/cp/c, queue/q, retrieve/ret/r and
%                        simulate/sim/s (print commands instead of running them)
%
%   All job settings are configured in the settings sections directly below.

clc

%% User-specific settings

% Your username
user = 'js056352';

% Your e-mail address
email = 'jan.steiner@uni-siegen.de';

%% Required job settings

% The Matlab script that you want to run (without '.m' at the end).
mFile = 'jobtest1';

% SLURM settings for your job. Use 'spartition' on the cluster to see queues.
% Caution: job will be stopped when maximum runtime is reached!
queue = 'short';
walltime = '0-00:10:00';

% Size of the parallel pool if needed (0=not needed, max. 16)
poolSize = 0;

% Cluster base directory (e.g. workspace). Has to be an absolute path.
% The directory containing <mFile>.m will be copied into this as a
% subdirectory whose name will be identical to the job name.
clusterBaseDir = '/home/js056352';

%% Optional job settings
% If you don't specifically need these, you can leave them alone.

% Give the job a name (default: <mFile>_<timestamp>). Will be set
% later if left empty.
jobName = '';

% Job script name (default: run_<mFile>.slurm). Will be set later if
% left empty.
jobScriptName = '';

% Number of threads for Matlab to use (0 = automatic). By default this will be:
% * for regular Matlab jobs: all CPUs on the node
% * for parallel pool jobs: 1 (per worker)
nThreads = 0;

% Additional pre- and postprocessing steps, these will be added to the job
% script before and after the call to Matlab
preSteps = '';
postSteps = '';

% Whether you want to receive an email notification (default: 'END')
% For possible options, type 'man sbatch' on the cluster and look for
% 'mail-type'
mailType = 'END';

% Additional SLURM command line parameters. See 'sbatch' documentation for
% possible options.
sbatchOptions = '';

% Cluster address
clusterAddress = 'omni.zimt.uni-siegen.de';

% Log file names (will use SLURM defaults if left empty)
slurmStdoutLog = '';
slurmStderrLog = '';

% Name of mat file that will contain necessary info to copy data back.
retrievalInfoFileName = 'retrievalinfo.mat';

%% End of settings
% You should not modify the script below this line unless you
% absolutely know what you are doing!

%% Handle input arguments
if ~exist(['./' mFile '.m'],'file')
    error('No mFile named %s.m found!', mFile)
end

actions = setActions(nargin,varargin);

% The folder that holds the m-file is the folder that gets copied.
localDir = fileparts( which(mFile) );

% Handle Windows backslashes. The doubling dates from when the path was
% embedded in fprintf format strings; it is kept so that the commands that
% are executed and the saved retrieval info stay the same as before.
if ispc
    localDir = strrep(localDir,'\','\\');
end

timestamp = datestr(now, 'yyyy-mm-dd_HH-MM-SS');

%% Set options according to input arguments
% Set defaults
if isempty(jobName)
    jobName = [mFile,'_',timestamp];
end
if isempty(jobScriptName)
    jobScriptName = ['run_',mFile,'.slurm'];
end
clusterDir = [clusterBaseDir,'/',jobName];

% Determine settings for selected queue
maxThreads = 64;

% Multithreading and parallel processing
if nThreads == 0
    if strcmp(queue,'smp')
        error( 'The number of threads has to be set manually when using the SMP queue' );
    end
    if poolSize <= 1
        % Regular Matlab job: one task that may use the whole node.
        nTasks = 1;
        nThreads = maxThreads;
    else
        % Parallel pool job: one single-threaded task per worker.
        nTasks = poolSize;
        nThreads = 1;
    end
else
    % Thread count given manually. At least one task is always needed,
    % otherwise the job script would request '#SBATCH -n 0'.
    nTasks = max(poolSize, 1);
end

% Checks
if nThreads > maxThreads
    error( ['Too many threads for nodes in ' queue ' partition!'] )
end
if nTasks > 16
    error('More than 16 Matlab workers are not possible on the OMNI cluster!')
end

disp('OMNI job utility script.')
disp('Actions performed:')
for currAct = keys(actions)
    if actions(currAct{1})
        disp([ ' -> ' currAct{1} ])
    end
end

%% Generate job script
if actions('generate')
    fprintf('\nGenerating job script, filename: %s\n', jobScriptName)

    % Compose job script content, one cell per line
    scriptLines = { ...
        '#!/bin/bash', ...
        ['#SBATCH -J ' jobName], ...
        ['#SBATCH -n ' num2str(nTasks)], ...
        ['#SBATCH --partition=' queue], ...
        ['#SBATCH --time=' walltime], ...
        ['#SBATCH --cpus-per-task=' num2str(nThreads)], ...
        ['#SBATCH --mail-user=' email], ...
        ['#SBATCH --mail-type=' mailType] };
    if ~isempty(slurmStdoutLog)
        scriptLines{end+1} = ['#SBATCH --output=' slurmStdoutLog];
    end
    if ~isempty(slurmStderrLog)
        scriptLines{end+1} = ['#SBATCH --error=' slurmStderrLog];
    end
    if ~isempty(sbatchOptions)
        % Only recorded as a comment; the options themselves are passed on
        % the sbatch command line when the job is queued.
        scriptLines = [scriptLines, { ...
            '# Additional command line options used when queueing this script:', ...
            ['# ' sbatchOptions] }];
    end
    scriptLines = [scriptLines, { ...
        '', ...
        '# Begin of job commands', ...
        'module load matlab', ...
        ['cd ' clusterDir '/'], ...
        '' }];
    if ~isempty(preSteps)
        scriptLines = [scriptLines, { ...
            '# Preprocessing steps', ...
            preSteps, ...
            '' }];
    end
    scriptLines = [scriptLines, { ...
        '# Primary Matlab call', ...
        ['matlab -nodisplay -r "' mFile '; quit"'] }];
    if ~isempty(postSteps)
        scriptLines = [scriptLines, { ...
            '', ...
            '# Postprocessing steps', ...
            postSteps }];
    end

    % Join with newlines and terminate the last line. The text is always
    % handed to sprintf/fprintf as data ('%s'), never as the format, so
    % '%' or '\' in the settings end up in the script unchanged.
    batchContent = sprintf('%s\n', strjoin(scriptLines, '\n'));

    if ~actions('simulate')
        % Write the job script ('w' keeps LF line endings on Windows too)
        jobScriptFile = fopen(jobScriptName, 'w');
        if jobScriptFile == -1
            error('Unable to open job script file %s for writing!', jobScriptName)
        end
        fprintf(jobScriptFile, '%s', batchContent);
        fclose(jobScriptFile);
        % Wait for the file to be written
        while exist(jobScriptName,'file') ~= 2
            pause(0.5)
        end
    else
        fprintf(' No Job script written due to simulation mode. Jobscript contents:\n')
        fprintf('-------- Begin of job script --------\n')
        fprintf('%s', batchContent)
        fprintf('-------- End of job script --------\n')
    end
end

%% Copy all necessary files to the cluster
if actions('copy')
    fprintf('\nCopying files to cluster. You may need to enter your password if prompted:\n')
    % The local path is quoted so that folders with spaces survive the shell.
    copycmd = ['scp -r "' localDir '" ' user '@' clusterAddress ':' clusterDir];
    if ~actions('simulate')
        cmdStatus = system(copycmd,'-echo');
        if cmdStatus ~= 0
            error('Unable to copy files to cluster, aborting!');
        end
    else
        fprintf(' No SCP command executed due to simulation mode. Command string:\n');
        fprintf(' %s\n', copycmd);
    end
    % Save info necessary to copy all results from the cluster back here
    fprintf('\nSaving retrieval info to: %s\n\n', retrievalInfoFileName)
    save( retrievalInfoFileName, 'user', 'clusterAddress', 'localDir', 'clusterDir' );
end

%% Login and queue the script
if actions('queue')
    fprintf('Queuing job on cluster.\n')
    % The job is queued in the directory recorded by the last 'copy' action,
    % which may stem from an earlier run of this script.
    if ~exist(retrievalInfoFileName, 'file')
        error('Unable to find retrieval info file! Run the "copy" action first.')
    end
    load( retrievalInfoFileName, 'user', 'clusterAddress', 'clusterDir' );
    sbatchcmd = ['sbatch ' sbatchOptions ' ' jobScriptName];
    % '&&' prevents sbatch from running in the wrong directory if 'cd' fails.
    jobqueuecmd = ['ssh ' user '@' clusterAddress ' "cd ' clusterDir ' && ' sbatchcmd '"'];
    if ~actions('simulate')
        cmdStatus = system(jobqueuecmd, '-echo');
        if cmdStatus ~= 0
            error('Unable to queue job on cluster, aborting!')
        end
    else
        fprintf(' No SSH command executed due to simulation mode. Command string:\n');
        fprintf(' %s\n', jobqueuecmd);
    end
end

%% Retrieve results from cluster
if actions('retrieve')
    retrieve( retrievalInfoFileName, actions('simulate') )
end

%% Housekeeping
fprintf('\nScript successfully completed.\n')

end


function actions = setActions(nArgs, args)
% SETACTIONS  Translate the command line arguments into a map of actions.
%
%   nArgs  number of arguments passed to the main function
%   args   cell array with those arguments
%
%   Returns a containers.Map with the logical entries 'simulate',
%   'generate', 'copy', 'queue' and 'retrieve'. With no arguments, or with
%   the simulate option as the only argument, generate+copy+queue are set.

%% Initialize
actions = containers.Map('KeyType','char','ValueType','logical');

% Accepted spellings for every action
aliases = struct( ...
    'simulate', {{'simulate','sim','s'}}, ...
    'generate', {{'generate','gen','g'}}, ...
    'copy',     {{'copy','cp','c'}}, ...
    'queue',    {{'queue','q'}}, ...
    'retrieve', {{'retrieve','ret','r'}} );

%% Set
actionNames = fieldnames(aliases);
knownArgs = {};
for k = 1:numel(actionNames)
    spellings = aliases.(actionNames{k});
    actions(actionNames{k}) = any( cellfun(@(s) any(strcmp(args,s)), spellings) );
    knownArgs = [knownArgs, spellings]; %#ok<AGROW>
end

% Default: run the complete submission when no action was requested
if ( nArgs == 1 && actions('simulate') ) || nArgs == 0
    actions('generate') = true;
    actions('copy') = true;
    actions('queue') = true;
end

%% Check
% Misspelled arguments would otherwise be ignored without any notice.
charArgs = args( cellfun(@ischar, args) );
unknownArgs = charArgs( ~ismember(charArgs, knownArgs) );
if ~isempty(unknownArgs)
    warning('Ignoring unrecognized argument(s): %s', strjoin(unknownArgs, ', '))
end

if actions('retrieve') && ( actions('queue') || actions('generate') || actions('copy') )
    warning('Attempting to retrieve a folder immediately after creation. Results will be incomplete or nonexistent.')
end

end


function retrieve(retInfoFileName,simMode)
% RETRIEVE  Copy the job folder from the cluster back to the local machine.
%
%   retInfoFileName  mat file written by the 'copy' action; provides
%                    user, clusterAddress, clusterDir and localDir
%   simMode          if true, only print the scp command
%
%   The folder is copied into the parent of the recorded local directory.
%   Raises an error if the info file is missing or scp reports a failure.

%% Load the necessary information
if ~exist(retInfoFileName, 'file')
    error('Unable to find retrieval info file!')
end
info = load( retInfoFileName, 'user', 'clusterAddress', 'clusterDir', 'localDir' );

%% Copy the folder to the local PC
% The local target is quoted so that folders with spaces survive the shell.
copycmd = ['scp -r ' info.user '@' info.clusterAddress ':' info.clusterDir ...
           ' "' info.localDir '/.."'];
fprintf('Retrieving result data from cluster.\n')
fprintf('You may need to enter your password if prompted.\n')
if ~simMode
    copyStatus = system(copycmd,'-echo');
    if copyStatus ~= 0
        % Format arguments are needed here: they make error() process '\n'
        % and print the numeric status as a number.
        error('Could not copy directory %s to local computer!\nSCP command reports status %d', ...
              info.clusterDir, copyStatus)
    end
else
    fprintf(' No SCP command executed due to simulation mode. Command syntax:\n')
    fprintf(' %s\n', copycmd)
end
fprintf('Script completed.\n')

end