Source code for hylite.samtools_wrapper
#!/usr/bin/env python3
# (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
# This file is part of HyLiTE.
#
# HyLiTE is a free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
# HyLiTE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HyLiTE. If not, see <http://www.gnu.org/licenses/>
#===============================#
# author: Wandrille Duchemin #
# Murray Cox #
# last modified: 9 January 2018 #
#===============================#
from .Parameters import Parameters
from .Generic_wrapper import Generic_wrapper #import os too
import os
from threading import Thread
import sys
[docs]class samtools_wrapper(Generic_wrapper):
'''
A wrapper for samtools
'''
def __init__(self):
'''
A wrapper for samtools
'''
param = Parameters()
Generic_wrapper.__init__(self, param.get_param('SAMTOOLS_PATH'),\
{'view': param.get_param('SAMTOOLS_NAME_VIEW'),\
"sort": param.get_param('SAMTOOLS_NAME_SORT'),\
"index": param.get_param('SAMTOOLS_NAME_INDEX'),\
"mpileup": param.get_param('SAMTOOLS_NAME_MPILEUP'),\
"faidx": param.get_param('SAMTOOLS_NAME_FAIDX') },\
{'in' :param.get_param('SAMTOOLS_OPTION_IN'),\
'view_out': param.get_param('SAMTOOLS_VIEW_OPTION_OUT'),\
'out': param.get_param('SAMTOOLS_OPTION_OUT'),\
'noBAQ': param.get_param('SAMTOOLS_MPILEUP_OPTION_NOBAQ'),\
'min_quality': param.get_param('SAMTOOLS_MPILEUP_OPTION_MINQUAL'),\
'max_coverage': param.get_param('SAMTOOLS_MPILEUP_OPTION_MAXCOV'),\
'ref': param.get_param('SAMTOOLS_MPILEUP_OPTION_REFERENCE'),\
'sample_file': param.get_param('SAMTOOLS_MPILEUP_OPTION_SAMPLE'),\
'paired': param.get_param('SAMTOOLS_MPILEUP_OPTION_PAIRED')})
#this should provide a flexible framework. if an option change, just modify the according variable in params.py
#should you wish to add a new option, add an option to the params.py and add it to the options dictionnary
return
[docs] def samtobam(self, samfile, bamfile):
'''Convert .sam to .bam
Args:
- samfile (str): name of a .sam file
- bamfile (str): name of the .bam file to write
'''
basename = 'view'
options_value = {'in':samfile,'view_out' : bamfile}
order = ['in', 'view_out']
self.commandline = self.build_command_line(basename, options_value, order)
if Parameters().get_param('VERBOSE'):
self.print_command_line()
p=self.run()
if Parameters().get_param('VERBOSE'):
sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
else: # have to read it to execute the code
p.stdout.read().decode('utf8')
p.stderr.read().decode('utf8')
return
[docs] def sort(self, bamfile, sortedprefix):
'''Sort a .bam file
Args:
- bamfile (str): name of a .bam file
- sortedprefix (str): prefix of the sorted .bam to write (.bam will be appended to your prefix)
'''
basename='sort'
if Parameters().get_param("SAMTOOLS_VERSION") == 0:
options_value = {'in':bamfile,'out' : sortedprefix}
elif Parameters().get_param("SAMTOOLS_VERSION") == 1:
options_value = {'in':bamfile,'out' : sortedprefix+".bam"}
else:
raise IOError("Version of samtools not recognized\n")
order = ['in', 'out']
self.commandline = self.build_command_line(basename, options_value, order)
if Parameters().get_param('VERBOSE'):
self.print_command_line()
p=self.run()
if Parameters().get_param('VERBOSE'):
sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
else: # have to read it to execute the code
p.stdout.read().decode('utf8')
p.stderr.read().decode('utf8')
return
[docs] def index(self, sortedfile):
'''Index a sorted .bam file
Args:
- sortedfile (str): name of a sorted .bam file
'''
basename='index'
options_value = {'in':sortedfile}
order = ['in']
self.commandline = self.build_command_line(basename, options_value, order)
if Parameters().get_param('VERBOSE'):
self.print_command_line()
p=self.run()
if Parameters().get_param('VERBOSE'):
sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
else: # have to read it to execute the code
p.stdout.read().decode('utf8')
p.stderr.read().decode('utf8')
return
[docs] def faidx(self, fastafile):
'''Index the fasta file so it can be used by mpileup/sort/index as a reference
Args:
- fastafile (str): a fasta file name
'''
basename='faidx'
options_value = {'in':fastafile}
order = ['in']
self.commandline = self.build_command_line(basename, options_value, order)
if Parameters().get_param('VERBOSE'):
self.print_command_line()
p=self.run()
if Parameters().get_param('VERBOSE'):
sys.stdout.write(p.stdout.read().decode('utf8')+'\n')
sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
else: # have to read it to execute the code
p.stdout.read().decode('utf8')
p.stderr.read().decode('utf8')
return
[docs] def mpileup(self, reffile, samplefile, paired, pilefile):
'''Make the pileup file containing all SNP and reads data.
Args:
- reffile (str): the name of the indexed fasta file used as reference
- samplefile (str): the name of a file containing the list of the sorted .bam files to pileup (one file by line, grouped by organism, child always first)
- paired (bool): a boolean set to True if at least one organism has paired end reads
- pilefile (str): the name of the pileup file to write
'''
basename = 'mpileup'
options_value = {'noBAQ':True, 'paired':paired, 'min_quality': 0 , 'max_coverage': 1000000, 'ref': reffile, 'sample_file':samplefile}
order = ['noBAQ', 'paired', 'min_quality', 'max_coverage', 'ref', 'sample_file']
self.commandline = self.build_command_line(basename, options_value, order)
if Parameters().get_param('VERBOSE'):
self.print_command_line()
p=self.run()
#in the case of mpileup, we redirect the std entry to a file (we could avoid that and just feed it to pileup reader. However, the user could have another use for the .pileup file)
handle = open(pilefile, 'w')
handle.write(p.stdout.read().decode('utf8'))
handle.close()
if Parameters().get_param('VERBOSE'):
sys.stderr.write(p.stderr.read().decode('utf8')+'\n')
else:
p.stderr.read().decode('utf8')
return
[docs] def prep_sam_file(self, samfile, outdir):
'''Convert to .bam, sort and index the file
Args:
- samfile (str): a .sam file name
- outdir (str): the path of the directory where the files will be written
'''
fileprefix = outdir
if fileprefix[-1]!=os.sep:
fileprefix += os.sep #we complete (if need be) the name of the out directory
fileprefix += samfile.rpartition('.sam')[0].rpartition(os.sep)[2] #we append the general prefix of the file name
#converting to bam file
bamfile = fileprefix+'.bam'
self.samtobam(samfile, bamfile)
#sorting
self.sort(bamfile, fileprefix+'.sorted')
#indexing
sortedfile = fileprefix+'.sorted'+'.bam'
self.index(sortedfile)
return
[docs] def pipeline_sequential(self, lfile, outdir):#destined to be given to a Thread
'''Treat sequentially a list of files
Args:
- lfile (list): a list of .sam filenames (ordered by organism and sample)
- outdir (str): the path of the directory where the files will be written
'''
for f in lfile:
self.prep_sam_file(f, outdir)
return
[docs] def pipeline(self, reffile, llfile, outdir, nb_thread, pilename):
'''Uses the allowed threads to convert/sort/index the different sam files.
Execute then the mpileup command to create the pileup.
Args:
- reffile (str): the name of the reference file (.fasta)
- llfile (list): a list of list of file (ordered by organism and sample)
- outdir (str): the path of the directory where the files will be written
- nb_thread (int): the number of threads allowed
- pilename (str): the name of the pileupfile to create
Returns:
- str. name of the .pileup file
'''
#first, we index the reference file
self.faidx(reffile)
#we begin by establishing of the name of the final
lsortedname = list()
for lfile in llfile:
for samfile in lfile:
fileprefix = outdir
if fileprefix[-1]!=os.sep:
fileprefix += os.sep #we complete (if need be) the name of the out directory
fileprefix = fileprefix + samfile.rpartition('.sam')[0].rpartition(os.sep)[2] + '.sorted.bam'
lsortedname.append(fileprefix)
if nb_thread<len(llfile):#less threads than organisms: we treat each file in a sequence
lfile=list()
for l in llfile:
lfile += l
self.pipeline_sequential(lfile, outdir)
else:
if nb_thread>=sum([len(x) for x in llfile]): #at least a thread by file:we treat each file independently
l=list()
for lfile in llfile:
for f in lfile:
l.append([f])
llfile = l #we replace the list of list by a list containing all file in a separate list
#we can create the Thread strcture
lthread=list()
for lfile in llfile: #for each list of file
lthread.append(Thread(target=self.pipeline_sequential, args=(lfile, outdir,))) #creation of the thread
lthread[-1].daemon = True #just in case, the thread will be killed if HyLiTE is killed
lthread[-1].start()
#we now wait for all our threads to be finished
for t in lthread:
t.join() #not very elegant, but simple and efficient
#by this point, every file has been correctly converted
#we now create the file containing the name of the samples
sample_file = pilename.rpartition('.pileup')[0] + '.txt' #same name as the pileup file, different suffixe
fsample = open(sample_file, 'w')
for sname in lsortedname:
fsample.write(sname+'\n')
fsample.close()
#we can now launch the mpileup command
self.mpileup(reffile, sample_file, True, pilename) #by default, we set paired to true, this have no effect on unpaired reads
return pilename