Source code for hylite.samtools_wrapper

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 9 January 2018 #
#===============================#


from .Parameters import Parameters
from .Generic_wrapper import Generic_wrapper #import os too

import os
from threading import Thread
import sys

[docs]class samtools_wrapper(Generic_wrapper): ''' A wrapper for samtools ''' def __init__(self): ''' A wrapper for samtools ''' param = Parameters() Generic_wrapper.__init__(self, param.get_param('SAMTOOLS_PATH'),\ {'view': param.get_param('SAMTOOLS_NAME_VIEW'),\ "sort": param.get_param('SAMTOOLS_NAME_SORT'),\ "index": param.get_param('SAMTOOLS_NAME_INDEX'),\ "mpileup": param.get_param('SAMTOOLS_NAME_MPILEUP'),\ "faidx": param.get_param('SAMTOOLS_NAME_FAIDX') },\ {'in' :param.get_param('SAMTOOLS_OPTION_IN'),\ 'view_out': param.get_param('SAMTOOLS_VIEW_OPTION_OUT'),\ 'out': param.get_param('SAMTOOLS_OPTION_OUT'),\ 'noBAQ': param.get_param('SAMTOOLS_MPILEUP_OPTION_NOBAQ'),\ 'min_quality': param.get_param('SAMTOOLS_MPILEUP_OPTION_MINQUAL'),\ 'max_coverage': param.get_param('SAMTOOLS_MPILEUP_OPTION_MAXCOV'),\ 'ref': param.get_param('SAMTOOLS_MPILEUP_OPTION_REFERENCE'),\ 'sample_file': param.get_param('SAMTOOLS_MPILEUP_OPTION_SAMPLE'),\ 'paired': param.get_param('SAMTOOLS_MPILEUP_OPTION_PAIRED')}) #this should provide a flexible framework. if an option change, just modify the according variable in params.py #should you wish to add a new option, add an option to the params.py and add it to the options dictionnary return
[docs] def samtobam(self, samfile, bamfile): '''Convert .sam to .bam Args: - samfile (str): name of a .sam file - bamfile (str): name of the .bam file to write ''' basename = 'view' options_value = {'in':samfile,'view_out' : bamfile} order = ['in', 'view_out'] self.commandline = self.build_command_line(basename, options_value, order) if Parameters().get_param('VERBOSE'): self.print_command_line() p=self.run() if Parameters().get_param('VERBOSE'): sys.stdout.write(p.stdout.read().decode('utf8')+'\n') sys.stderr.write(p.stderr.read().decode('utf8')+'\n') else: # have to read it to execute the code p.stdout.read().decode('utf8') p.stderr.read().decode('utf8') return
[docs] def sort(self, bamfile, sortedprefix): '''Sort a .bam file Args: - bamfile (str): name of a .bam file - sortedprefix (str): prefix of the sorted .bam to write (.bam will be appended to your prefix) ''' basename='sort' if Parameters().get_param("SAMTOOLS_VERSION") == 0: options_value = {'in':bamfile,'out' : sortedprefix} elif Parameters().get_param("SAMTOOLS_VERSION") == 1: options_value = {'in':bamfile,'out' : sortedprefix+".bam"} else: raise IOError("Version of samtools not recognized\n") order = ['in', 'out'] self.commandline = self.build_command_line(basename, options_value, order) if Parameters().get_param('VERBOSE'): self.print_command_line() p=self.run() if Parameters().get_param('VERBOSE'): sys.stdout.write(p.stdout.read().decode('utf8')+'\n') sys.stderr.write(p.stderr.read().decode('utf8')+'\n') else: # have to read it to execute the code p.stdout.read().decode('utf8') p.stderr.read().decode('utf8') return
[docs] def index(self, sortedfile): '''Index a sorted .bam file Args: - sortedfile (str): name of a sorted .bam file ''' basename='index' options_value = {'in':sortedfile} order = ['in'] self.commandline = self.build_command_line(basename, options_value, order) if Parameters().get_param('VERBOSE'): self.print_command_line() p=self.run() if Parameters().get_param('VERBOSE'): sys.stdout.write(p.stdout.read().decode('utf8')+'\n') sys.stderr.write(p.stderr.read().decode('utf8')+'\n') else: # have to read it to execute the code p.stdout.read().decode('utf8') p.stderr.read().decode('utf8') return
[docs] def faidx(self, fastafile): '''Index the fasta file so it can be used by mpileup/sort/index as a reference Args: - fastafile (str): a fasta file name ''' basename='faidx' options_value = {'in':fastafile} order = ['in'] self.commandline = self.build_command_line(basename, options_value, order) if Parameters().get_param('VERBOSE'): self.print_command_line() p=self.run() if Parameters().get_param('VERBOSE'): sys.stdout.write(p.stdout.read().decode('utf8')+'\n') sys.stderr.write(p.stderr.read().decode('utf8')+'\n') else: # have to read it to execute the code p.stdout.read().decode('utf8') p.stderr.read().decode('utf8') return
[docs] def mpileup(self, reffile, samplefile, paired, pilefile): '''Make the pileup file containing all SNP and reads data. Args: - reffile (str): the name of the indexed fasta file used as reference - samplefile (str): the name of a file containing the list of the sorted .bam files to pileup (one file by line, grouped by organism, child always first) - paired (bool): a boolean set to True if at least one organism has paired end reads - pilefile (str): the name of the pileup file to write ''' basename = 'mpileup' options_value = {'noBAQ':True, 'paired':paired, 'min_quality': 0 , 'max_coverage': 1000000, 'ref': reffile, 'sample_file':samplefile} order = ['noBAQ', 'paired', 'min_quality', 'max_coverage', 'ref', 'sample_file'] self.commandline = self.build_command_line(basename, options_value, order) if Parameters().get_param('VERBOSE'): self.print_command_line() p=self.run() #in the case of mpileup, we redirect the std entry to a file (we could avoid that and just feed it to pileup reader. However, the user could have another use for the .pileup file) handle = open(pilefile, 'w') handle.write(p.stdout.read().decode('utf8')) handle.close() if Parameters().get_param('VERBOSE'): sys.stderr.write(p.stderr.read().decode('utf8')+'\n') else: p.stderr.read().decode('utf8') return
[docs] def prep_sam_file(self, samfile, outdir): '''Convert to .bam, sort and index the file Args: - samfile (str): a .sam file name - outdir (str): the path of the directory where the files will be written ''' fileprefix = outdir if fileprefix[-1]!=os.sep: fileprefix += os.sep #we complete (if need be) the name of the out directory fileprefix += samfile.rpartition('.sam')[0].rpartition(os.sep)[2] #we append the general prefix of the file name #converting to bam file bamfile = fileprefix+'.bam' self.samtobam(samfile, bamfile) #sorting self.sort(bamfile, fileprefix+'.sorted') #indexing sortedfile = fileprefix+'.sorted'+'.bam' self.index(sortedfile) return
[docs] def pipeline_sequential(self, lfile, outdir):#destined to be given to a Thread '''Treat sequentially a list of files Args: - lfile (list): a list of .sam filenames (ordered by organism and sample) - outdir (str): the path of the directory where the files will be written ''' for f in lfile: self.prep_sam_file(f, outdir) return
[docs] def pipeline(self, reffile, llfile, outdir, nb_thread, pilename): '''Uses the allowed threads to convert/sort/index the different sam files. Execute then the mpileup command to create the pileup. Args: - reffile (str): the name of the reference file (.fasta) - llfile (list): a list of list of file (ordered by organism and sample) - outdir (str): the path of the directory where the files will be written - nb_thread (int): the number of threads allowed - pilename (str): the name of the pileupfile to create Returns: - str. name of the .pileup file ''' #first, we index the reference file self.faidx(reffile) #we begin by establishing of the name of the final lsortedname = list() for lfile in llfile: for samfile in lfile: fileprefix = outdir if fileprefix[-1]!=os.sep: fileprefix += os.sep #we complete (if need be) the name of the out directory fileprefix = fileprefix + samfile.rpartition('.sam')[0].rpartition(os.sep)[2] + '.sorted.bam' lsortedname.append(fileprefix) if nb_thread<len(llfile):#less threads than organisms: we treat each file in a sequence lfile=list() for l in llfile: lfile += l self.pipeline_sequential(lfile, outdir) else: if nb_thread>=sum([len(x) for x in llfile]): #at least a thread by file:we treat each file independently l=list() for lfile in llfile: for f in lfile: l.append([f]) llfile = l #we replace the list of list by a list containing all file in a separate list #we can create the Thread strcture lthread=list() for lfile in llfile: #for each list of file lthread.append(Thread(target=self.pipeline_sequential, args=(lfile, outdir,))) #creation of the thread lthread[-1].daemon = True #just in case, the thread will be killed if HyLiTE is killed lthread[-1].start() #we now wait for all our threads to be finished for t in lthread: t.join() #not very elegant, but simple and efficient #by this point, every file has been correctly converted #we now create the file containing the name of the samples sample_file = pilename.rpartition('.pileup')[0] + '.txt' #same name as the pileup file, different suffixe fsample = open(sample_file, 'w') for sname in lsortedname: fsample.write(sname+'\n') fsample.close() #we can now launch the mpileup command self.mpileup(reffile, sample_file, True, pilename) #by default, we set paired to true, this have no effect on unpaired reads return pilename