Source code for hylite.samtools_wrapper

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 9 January 2018 #
#===============================#


from .Parameters import Parameters
from .Generic_wrapper import Generic_wrapper #import os too

import os
from threading import Thread
import sys

[docs]class samtools_wrapper(Generic_wrapper):
    '''
    A wrapper for samtools
    '''
    def __init__(self):
        '''
        A wrapper for samtools
        '''
        param = Parameters()

        Generic_wrapper.__init__(self, param.get_param('SAMTOOLS_PATH'),\
        {'view': param.get_param('SAMTOOLS_NAME_VIEW'),\
         "sort": param.get_param('SAMTOOLS_NAME_SORT'),\
         "index": param.get_param('SAMTOOLS_NAME_INDEX'),\
         "mpileup": param.get_param('SAMTOOLS_NAME_MPILEUP'),\
         "faidx": param.get_param('SAMTOOLS_NAME_FAIDX') },\
        {'in' :param.get_param('SAMTOOLS_OPTION_IN'),\
         'view_out': param.get_param('SAMTOOLS_VIEW_OPTION_OUT'),\
         'out': param.get_param('SAMTOOLS_OPTION_OUT'),\
         'noBAQ': param.get_param('SAMTOOLS_MPILEUP_OPTION_NOBAQ'),\
         'min_quality': param.get_param('SAMTOOLS_MPILEUP_OPTION_MINQUAL'),\
         'max_coverage': param.get_param('SAMTOOLS_MPILEUP_OPTION_MAXCOV'),\
         'ref': param.get_param('SAMTOOLS_MPILEUP_OPTION_REFERENCE'),\
         'sample_file': param.get_param('SAMTOOLS_MPILEUP_OPTION_SAMPLE'),\
         'paired': param.get_param('SAMTOOLS_MPILEUP_OPTION_PAIRED')})
        #this should provide a flexible framework. if an option change, just modify the according variable in params.py
        #should you wish to add a new option, add an option to the params.py and add it to the options dictionnary
        return

[docs]    def samtobam(self, samfile, bamfile):
        '''Convert .sam to .bam

        Args:
             - samfile (str): name of a .sam file
             - bamfile (str): name of the .bam file to write

        '''
        basename = 'view'
        options_value = {'in':samfile,'view_out' : bamfile}
        order = ['in', 'view_out']
        self.commandline = self.build_command_line(basename, options_value, order)

        if Parameters().get_param('VERBOSE'):
            self.print_command_line()

        p=self.run()
        if Parameters().get_param('VERBOSE'):
            sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
            sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
        else: # have to read it to execute the code
            p.stdout.read().decode('utf8')
            p.stderr.read().decode('utf8')

        return


[docs]    def sort(self, bamfile, sortedprefix):
        '''Sort a .bam file

        Args:
             - bamfile (str): name of a .bam file
             - sortedprefix (str): prefix of the sorted .bam to write (.bam will be appended to your prefix)

        '''
        basename='sort'
        if Parameters().get_param("SAMTOOLS_VERSION") == 0:
            options_value = {'in':bamfile,'out' : sortedprefix}
        elif Parameters().get_param("SAMTOOLS_VERSION") == 1:
            options_value = {'in':bamfile,'out' : sortedprefix+".bam"}
        else:
            raise IOError("Version of samtools not recognized\n")
        order = ['in', 'out']
        self.commandline = self.build_command_line(basename, options_value, order)

        if Parameters().get_param('VERBOSE'):
            self.print_command_line()

        p=self.run()
        if Parameters().get_param('VERBOSE'):
            sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
            sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
        else: # have to read it to execute the code
            p.stdout.read().decode('utf8')
            p.stderr.read().decode('utf8')

        return

[docs]    def index(self, sortedfile):
        '''Index a sorted .bam file

        Args:
             - sortedfile (str): name of a sorted .bam file

        '''
        basename='index'
        options_value = {'in':sortedfile}
        order = ['in']
        self.commandline = self.build_command_line(basename, options_value, order)

        if Parameters().get_param('VERBOSE'):
            self.print_command_line()

        p=self.run()
        if Parameters().get_param('VERBOSE'):
            sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
            sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
        else: # have to read it to execute the code
            p.stdout.read().decode('utf8')
            p.stderr.read().decode('utf8')
        return

[docs]    def faidx(self, fastafile):
        '''Index the fasta file so it can be used by mpileup/sort/index as a reference

        Args:
             - fastafile (str): a fasta file name

        '''
        basename='faidx'
        options_value = {'in':fastafile}
        order = ['in']
        self.commandline = self.build_command_line(basename, options_value, order)

        if Parameters().get_param('VERBOSE'):
            self.print_command_line()

        p=self.run()
        if Parameters().get_param('VERBOSE'):
            sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
            sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
        else: # have to read it to execute the code
            p.stdout.read().decode('utf8')
            p.stderr.read().decode('utf8')

        return

[docs]    def mpileup(self, reffile, samplefile, paired, pilefile):
        '''Make the pileup file containing all SNP and reads data.

        Args:
             - reffile (str): the name of the indexed fasta file used as reference
             - samplefile (str): the name of a file containing the list of the sorted .bam files to pileup (one file by line, grouped by organism, child always first)
             - paired (bool): a boolean set to True if at least one organism has paired end reads
             - pilefile (str): the name of the pileup file to write

        '''
        basename = 'mpileup'
        options_value = {'noBAQ':True, 'paired':paired, 'min_quality': 0 , 'max_coverage': 1000000, 'ref': reffile, 'sample_file':samplefile}
        order = ['noBAQ', 'paired', 'min_quality', 'max_coverage', 'ref', 'sample_file']
        self.commandline = self.build_command_line(basename, options_value, order)
        if Parameters().get_param('VERBOSE'):
            self.print_command_line()

        p=self.run()
        #in the case of mpileup, we redirect the std entry to a file (we could avoid that and just feed it to pileup reader. However, the user could have another use for the .pileup file)
        handle = open(pilefile, 'w')
        handle.write(p.stdout.read().decode('utf8'))
        handle.close()
        if Parameters().get_param('VERBOSE'):
            sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
        else:
            p.stderr.read().decode('utf8')

        return

[docs]    def prep_sam_file(self, samfile, outdir):
        '''Convert to .bam, sort and index the file

        Args:
             - samfile (str): a .sam file name
             - outdir (str): the path of the directory where the files will be written

        '''
        fileprefix = outdir
        if fileprefix[-1]!=os.sep:
            fileprefix += os.sep #we complete (if need be) the name of the out directory
        fileprefix += samfile.rpartition('.sam')[0].rpartition(os.sep)[2] #we append the general prefix of the file name

        #converting to bam file
        bamfile = fileprefix+'.bam'
        self.samtobam(samfile, bamfile)
        #sorting
        self.sort(bamfile, fileprefix+'.sorted')
        #indexing
        sortedfile = fileprefix+'.sorted'+'.bam'
        self.index(sortedfile)
        return


[docs]    def pipeline_sequential(self, lfile, outdir):#destined to be given to a Thread
        '''Treat sequentially a list of files

        Args:
             - lfile (list): a list of .sam filenames (ordered by organism and sample)
             - outdir (str): the path of the directory where the files will be written
        '''
        for f in lfile:
            self.prep_sam_file(f, outdir)
        return

[docs]    def pipeline(self, reffile, llfile, outdir, nb_thread, pilename):
        '''Uses the allowed threads to convert/sort/index the different sam files.
        Execute then the mpileup command to create the pileup.

        Args:
             - reffile (str): the name of the reference file (.fasta)
             - llfile (list): a list of list of file (ordered by organism and sample)
             - outdir (str): the path of the directory where the files will be written
             - nb_thread (int): the number of threads allowed
             - pilename (str): the name of the pileupfile to create

        Returns:
             - str. name of the .pileup file

        '''
        #first, we index the reference file
        self.faidx(reffile)
        #we begin by establishing of the name of the final
        lsortedname = list()
        for lfile in llfile:
            for samfile in lfile:
                fileprefix = outdir
                if fileprefix[-1]!=os.sep:
                    fileprefix += os.sep #we complete (if need be) the name of the out directory
                fileprefix = fileprefix + samfile.rpartition('.sam')[0].rpartition(os.sep)[2] + '.sorted.bam'
                lsortedname.append(fileprefix)

        if nb_thread<len(llfile):#less threads than organisms: we treat each file in a sequence
            lfile=list()
            for l in llfile:
                lfile += l
            self.pipeline_sequential(lfile, outdir)
        else:
            if nb_thread>=sum([len(x) for x in llfile]): #at least a thread by file:we treat each file independently
                l=list()
                for lfile in llfile:
                    for f in lfile:
                        l.append([f])
                llfile = l #we replace the list of list by a list containing all file in a separate list

            #we can create the Thread strcture
            lthread=list()
            for lfile in llfile: #for each list of file
                lthread.append(Thread(target=self.pipeline_sequential, args=(lfile, outdir,))) #creation of the thread
                lthread[-1].daemon = True #just in case, the thread will be killed if HyLiTE is killed
                lthread[-1].start()

            #we now wait for all our threads to be finished
            for t in lthread:
                t.join() #not very elegant, but simple and efficient
        #by this point, every file has been correctly converted
        #we now create the file containing the name of the samples
        sample_file = pilename.rpartition('.pileup')[0] + '.txt' #same name as the pileup file, different suffixe
        fsample = open(sample_file, 'w')
        for sname in lsortedname:
            fsample.write(sname+'\n')
        fsample.close()
        #we can now launch the mpileup command
        self.mpileup(reffile, sample_file, True, pilename) #by default, we set paired to true, this have no effect on unpaired reads
        return pilename