#PBS -S /bin/sh
#
# Specify the project the job should be accounted on (obligatory)
#PBS -A cbu
#
# Compute resources (obligatory)
#PBS -l walltime=40:00:00,nodes=5:ppn=2
#
# Space resources
#PBS -l pmem=1000mb
#
#
#WHAT THIS SCRIPT DOES
#This script allows you to run MPI Blast in parallel on the cluster
#Input 1: a sequence database file in fasta format (set this below)
#Input 2: a sequence query file in fasta format (set this below)
#Output: 3 files (all these files will be saved to the directory from which
#        you call this script)
#1. a result file with the output of the blast search.
#   Filename: thisScript.rJobNumber where thisScript is the name you give this
#   file and JobNumber is the job number that you get when you call
#   "qsub thisScript".
#2. a file containing the standard output from the blast search.
#   Filename: thisScript.oJobNumber
#3. a file containing the standard error from the blast search.
#   Filename: thisScript.eJobNumber
#
#TO USE THIS SCRIPT
#1. Create a file ~/.ncbirc containing (see blast documentation)
#[NCBI]
#Data=absPathToBlastMatriceDir
#2. Set the name and location of your input files (set this below)
#
#FINE TUNING THE SCRIPT
#It may be necessary to change the resource requirements in the PBS header of
#this file depending on the details of your blast search.
#You may also want to change the details of the blast search (see further
#down). The current settings are for a blastp search.
#
#EXIT STATUS
#The job exits non-zero if a required PBS variable is missing, an input file
#is unreadable, the shared directory cannot be set up, or mpiformatdb or
#mpiblast fails. The per-job work directories are removed on every exit path.

#Set up input and output directories and files
#Each user must set these values correctly
databaseFile="Homo_sapiens.NCBI35.nov.pep.fa"
databaseDir="/work/tim/data/project_vertebrateGeneTrees/runs/4_run/01_blast/DATA/"
queriesFile="Homo_sapiens.NCBI35.nov.pep.queries.fa"
queriesDir="/work/tim/data/project_vertebrateGeneTrees/runs/4_run/01_blast/DATA/"

#No need to modify anything below here unless you want to modify the details
#of the blast search (see blast search section)

#Location of the mpiBLAST executables
mpiformatdbBin="/local/mpiblast-1.4.0/bin/mpiformatdb"
mpiblastBin="/local/mpiblast-1.4.0/bin/mpiblast"

#Print a message on standard error and abort the job
die() {
  echo "$*" >&2
  exit 1
}

#Every path below is built from these variables and some of those paths are
#later passed to "rm -rf", so refuse to run if any of them is unset or empty.
: "${PBS_JOBID:?must be set by PBS}"
: "${PBS_JOBNAME:?must be set by PBS}"
: "${PBS_O_WORKDIR:?must be set by PBS}"
: "${PBS_NODEFILE:?must be set by PBS}"
: "${LOGNAME:?must be set}"

databaseFullPath="${databaseDir}/${databaseFile}"
queriesFullPath="${queriesDir}/${queriesFile}"
echo "DB file ${databaseFullPath}"
echo "Query file ${queriesFullPath}"

#Fail early instead of after the database formatting step
[ -r "${databaseFullPath}" ] || die "Cannot read DB file ${databaseFullPath}"
[ -r "${queriesFullPath}" ] || die "Cannot read query file ${queriesFullPath}"
[ -r "${PBS_NODEFILE}" ] || die "Cannot read node file ${PBS_NODEFILE}"

#The job number is PBS_JOBID with everything from the first dot removed
jobnumber="${PBS_JOBID%%.*}"
: "${jobnumber:?could not be derived from PBS_JOBID}"

resultsFile="${PBS_JOBNAME}.r${jobnumber}"
resultsDir="${PBS_O_WORKDIR}"
resultsFullPath="${resultsDir}/${resultsFile}"
echo "The results files will be found in ${resultsDir}"

#Shared and local storage, both private to this job
#This setup avoids different jobs that get assigned the same node from
#overwriting each other's data
SHARED="/work/${LOGNAME}/${jobnumber}"
LOCAL="/scratch/${LOGNAME}/${jobnumber}"

#Print each node allocated to this job once
list_nodes() {
  sort -u "${PBS_NODEFILE}"
}

#Remove the per-job directories. Installed as an EXIT trap so that it also
#runs when the job aborts part-way through.
cleanup() {
  #Step out of the directory that is about to be removed
  cd "/work/${LOGNAME}" 2>/dev/null || cd /
  #Node names are single words, so word splitting of the list is intended
  for node in $(list_nodes); do
    rm -rf "/net/${node}/${LOCAL:?}"
  done
  rm -rf "${SHARED:?}"
}
trap cleanup EXIT
#Turn the usual termination signals into a normal exit so the EXIT trap runs
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

#Set up the shared storage
mkdir -p "${SHARED}" || die "Cannot create ${SHARED}"
cd "${SHARED}" || die "Cannot change directory to ${SHARED}"

#Set local storage
echo "LOCAL is ${LOCAL}"
for node in $(list_nodes); do
  mkdir -p "/net/${node}/${LOCAL}" \
    || echo "Warning: cannot create /net/${node}/${LOCAL}" >&2
done

#Set environment variables used by mpiformatdb and mpiblast
export MPIBLAST_SHARED="${SHARED}"
export MPIBLAST_LOCAL="${LOCAL}"

# How many cpus are we running on?:
# (one line of the node file per allocated cpu; the arithmetic expansion
# strips any padding that wc may print)
CPUS=$(( $(wc -l < "${PBS_NODEFILE}") ))
echo "Running on ${CPUS} cpus"

#NOTE(review): the original author was unsure what this is for. It is
#presumably the global shared-memory size, in bytes, used by the MPICH ch_p4
#device -- confirm against the local MPI installation.
P4_GLOBMEMSIZE=171966464
export P4_GLOBMEMSIZE

#Format the database
echo "Started ${mpiformatdbBin} at $(date)"
"${mpiformatdbBin}" -N "${CPUS}" -i "${databaseFullPath}" -p T \
  || die "${mpiformatdbBin} failed; not starting the blast search"
echo "Finished ${mpiformatdbBin} at $(date)"

#Blast search details
#Most options of this command are the standard blast options (see the
#standard blast documentation)
#The last line contains options that are specific to MPI blast
#--concurrent controls copying of data to the nodes
#--removedb should remove the databases from the nodes when the search is
#  finished
#--db-replicate-count allows multiple nodes to search the same database.
#  This is an important option as very large blast searches i.e. with many
#  queries sometimes begin to consume huge amounts of RAM and eventually fail.
#  By replicating the database bits it allows multiple nodes to search the
#  same database fractions.
#Note that -d is given the database file name only, not its full path.
echo "Started ${mpiblastBin} at $(date)"
blastStatus=0
mpiexec "${mpiblastBin}" \
  -d "${databaseFile}" \
  -i "${queriesFullPath}" \
  -p blastp \
  -o "${resultsFullPath}" \
  -F T -M BLOSUM62 -W 3 -G 11 -E 1 -g T -m8 -v 3000 -b 3000 -e 1e-5 \
  --concurrent=10 --removedb --db-replicate-count=4 \
  || blastStatus=$?
echo "Finished ${mpiblastBin} at $(date)"

# Clean up:
#Done by the EXIT trap installed above; report the search's status to PBS
exit "${blastStatus}"