#!/usr/bin/env python3
# (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
# This file is part of HyLiTE.
#
# HyLiTE is a free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
# HyLiTE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HyLiTE. If not, see <http://www.gnu.org/licenses/>
#===============================#
# author: Wandrille Duchemin #
# Murray Cox #
# last modified: 9 January 2018 #
#===============================#
from .Parameters import Parameters
from .Protocol_Reader import Protocol_Reader
from .Snp_Generator import Snp_Generator
from .Output_manager import Output_manager
from .samtools_wrapper import samtools_wrapper
from .bowtie2_wrapper import bowtie2_wrapper
import sys
import os
import pickle
[docs]class Hylite:
'''
Main class of HyLiTE.
Coordinate all the different analysis and ensure proper output of the results
Attributes:
- lorg (list): list of organism. THE CHILD ORGANISM IS ALWAYS THE FIRST OF THE LIST
- referencefile (str): name of the .fasta file containing the reference sequences/gene model for the analysis
- referencebase (str): name of the .bt2 index
- lsnp (list): list of SNP objects, modeling the repartition of snps between different organisms
- dread (dict):dict (keys are sample name) of list of the child's Read, containing where they align on the reference model, their snp content, and their parental origin
- snp_gen (Snp_Generator)
- bt2_wrap (bowtie2_wrapper)
- samtls_wrap (samtools_wrapper)
- out_manager (Output_manager)
- picklingfile (str): name of the backup pickle file
- name (str): analysis name
- outdir (str): name of the directory in which the files will be written
- nb_nodes (int): number of nodes allowed for the job
- phred64_quality (boolean): boolean set to True if the quality of the readfiles is in phred64 quality
- pilefile (str): name of the .pileup file
- last_gene (str): id of the last written gene before the last pickle
'''
def __init__(self, name, outdir, nb_threads, phred64_quality):
'''
Main class of HyLiTE.
Coordinate all the different analysis and ensure proper output of the results
Args:
- name (str): name of the job
- outdir (str): name of the directory to write outputs in
- nb_threads (int)
- phred64_quality (bool)
'''
self.name = name
self.outdir = outdir
self.nb_nodes = nb_threads
self.phred64_quality = phred64_quality
self.lorg=list() #list of organism. THE CHILD ORGANISM IS ALWAYS THE FIRST OF THE LIST
self.referencefile=str() #name of the .fasta file containing the reference sequences/gene model for the analysis
self.lsnp = list() #list of SNP objects, modeling the repartition of snps between different organisms
self.dread = dict() #dict (keys are sample name) of list of the child's Read, containing where they align on the reference model, their snp content, and their parental origin
#snp_gen is generated later: when the pileup file is done
self.bt2_wrap = bowtie2_wrapper()
self.samtls_wrap = samtools_wrapper()
self.out_manager = Output_manager()
self.picklingfile = self.outdir +self.name + ".pickle"
[docs] def set_referencefile(self, referencefile):
'''Set the reference file in the current HyLiTE instance as well as in the outputmanager.
The output manager needs it to know which genes to write in the output files
'''
self.referencefile = referencefile
self.out_manager.referencefile = referencefile
[docs] def add_organism_list(self, filename, sam):
'''Add the list of organism to HyLiTE from a file given as argument
Args:
- filename (str): the name of the file containing the organism informations
- sam (bool): set to True if the protocol file contains in fact the .sam files
FORMAT of the file:
- without header
- separated by '\t'
- one line per sample per organism
- SAMPLE_TYPE is 'RNAseq' or 'gDNA' (if other, DEFAULT_SAMPLE_TYPE will be put instead)
- THE CHILD MUST BE THE FIRST ORGANISM
ORGANISM_NAME PLOIDY SAMPLE_NAME SAMPLE_TYPE READ_FILE.fastq
'''
protocol_read = Protocol_Reader(filename)
self.lorg = protocol_read.read(sam)
#important update of the reads
for sample in self.lorg[0].sample_name:
self.dread[sample] = list()
protocol_read.close()
return
[docs] def create_index(self):
'''
Create the index for the mapping
'''
self.referencebase = self.outdir + self.referencefile.rpartition('.')[0].rpartition(os.sep)[2] #name of the base: prefix of the reference .fasta file
self.bt2_wrap.build_index(self.referencefile, self.referencebase)
return
[docs] def mapping(self):
'''
Proceed to the mapping of every organism and every sample
'''
for org in self.lorg: #for each organism
for sample in org.sample_name:
samsuf = ''
if sample != "":
samsuf = '.'+sample
outname = self.outdir + org.name + samsuf + '.sam' # we create the name of the .sam file
self.bt2_wrap.mappe_versus(self.referencebase, org.dreadfile[sample], (len(org.dreadfile[sample]) == 2), Parameters().get_param('MISMATCH_DEFAULT'), self.nb_nodes, self.phred64_quality, outname)#mapping operation
org.dsamfile[sample] = outname
return
[docs] def pileup_init(self):
'''This function initialize the Snp_generator
'''
self.snp_gen = Snp_Generator(self.pilefile, self.lsnp, self.lorg, self.dread)
return
[docs] def pileup_read(self):
'''
This function execute the main algorithm of HyLiTE: it will call the SNPs for each position and then assign a category to each of the child's read
this function also check the need for pickling
'''
last_pickle = 0 #number of analysed genes since the last pickling
last_write = 0 #number of analysed genes since the last writing
self.last_gene = None
lim_pickle = Parameters().get_param('GENE_BETWEEN_PICKLE')
lim_write = Parameters().get_param('GENE_BETWEEN_WRITING')
for l in self.snp_gen.give_line():
if l.split('\t')[0] != self.snp_gen.current_gene: #new gene
last_pickle+=1
last_write+=1
if Parameters().get_param('VERBOSE'):
sys.stdout.write(l.split('\t')[0]+'\n') #write the new gene id
if last_write >= lim_write: #it's time to write the data
last_write = 0
self.output(False) #writing in mode 'a'
self.last_gene = self.snp_gen.current_gene #stocking the last written gene
self.clear()
self.snp_gen.treat_pileup_line(l)
if last_pickle >= lim_pickle:#time to pickle
last_pickle = 0
self.pickle()
self.out_manager.precompute_expression_data_towrite(self.lorg, False)
self.out_manager.write_complete_expression_file(self.outdir, self.name)
return
[docs] def pickle(self):
'''
This function pickle the Hylite instance
'''
f=open(self.picklingfile+'.tmp', 'wb')
pickle.dump(self, f)
f.close()
os.rename(self.picklingfile+'.tmp', self.picklingfile) #we replace the old file by the new
return
[docs] def restaure(self):
'''
This function is used to start the SNP calling again from the last gene completed.
It is used after an unpickling of the Hylite instance
'''
sys.stdout.write('restauring previous HyLiTE analysis...\n')
#before, we must close the pileup file
self.snp_gen.close_pileup()
#then we must restart it
self.snp_gen.restaure()
#restauring the results files:
self.truncate_gene(self.outdir + self.name + '.snp.txt', self.last_gene) #snp file
for sample in self.lorg[0].sample_name:
self.truncate_gene(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', self.last_gene)#read files
self.truncate_gene(self.outdir + self.name + '.expression.txt', self.last_gene)#expression file
for l in self.snp_gen.give_line():
if l.split('\t')[0] == self.snp_gen.current_gene: #we are at the right position
#self.snp_gen.treat_pileup_line(l) #this line has alreadey been read ...
if Parameters().get_param('VERBOSE'):
sys.stdout.write('restarting at gene'+str(l.split('\t')[0])+', position'+str(l.split('\t')[1])+"\n")
sys.stdout.write('opened reads:\t'+ str(self.snp_gen.get_nb_reads())+'\n')
break
# we are ready to move along...
self.pileup_read()
return
[docs] def output(self, header):
'''
Output the different results of HyLiTE
Args:
- header (bool): True if the header of the file must be written
'''
mode = 'a'
if header:#if the header is specified, the file are created
mode = 'w'
#we begin by writing the snps
handle = open(self.outdir + self.name + '.snp.txt', mode)
self.out_manager.write_snp(handle, self.lsnp, [o.name for o in self.lorg], header)
#writing reads
#reads are written by sample
for sample in self.lorg[0].sample_name:
handle = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', mode)
self.out_manager.write_read(handle, self.dread[sample], header, self.lorg)
#write expression
self.out_manager.precompute_expression_data_towrite(self.lorg, header)
return
[docs] def output_summary(self):
'''This function generate the summaries for the reads and the snps
'''
#We begin by the snps:
handle_in = open(self.outdir + self.name + '.snp.txt', 'r')
handle_out = open(self.outdir + self.name + '.snp.summary.txt', 'w')
self.out_manager.write_snp_summary(handle_in, handle_out)
#now the reads
for sample in self.lorg[0].sample_name:
handle_in = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', 'r')
handle_out = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.summary.txt', 'w')
self.out_manager.write_read_summary(handle_in, handle_out, self.lorg)
#output the run's summary
handle_out = open(self.outdir + self.name + '.run.summary.txt', 'w')
lhandle_in_read = []#list of handles for each sample read.summary.txt
for sample in self.lorg[0].sample_name:
lhandle_in_read.append(open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.summary.txt', 'r'))
handle_in_snp = open(self.outdir + self.name + '.snp.summary.txt', 'r')
self.out_manager.write_run_summary(handle_out, lhandle_in_read, handle_in_snp, self.lorg)#write a basic summary of the tun
return
[docs] def clear(self):
'''This function basically delete all the gene related informations
'''
#Reads deletion
for s in list(self.dread.keys()):#for each sample
self.dread[s] = list() #reinitialisation
#SNP deletion
while len(self.lsnp)>0:
self.lsnp.pop()
#Expression data and finger print deletion
for org in self.lorg:
org.clear()
return
[docs] def truncate_gene(self, filename, last_gene):
'''This function allow to truncate a result file after a specific gene.
It is useful when unpickling an ancient HyLiTE instance. (desynchronisation between the writing of results and pickling)
Args:
- filename (str): name of the file to truncate
- last_gene (str): id of the last gene before the pickling took place
'''
handle = open(filename, 'r+')
handle.seek(0, os.SEEK_END) #we go at the end of the file
pos = handle.tell() -1
handle.seek(pos, os.SEEK_SET)
#first we ignore the firsts '\n' and blank lines
while handle.read(1)=='\n':
pos-=1
handle.seek(pos, os.SEEK_SET)
sys.stdout.write('no "\\n"'+str(pos)+'\n')
#now we go back to the next line end
while pos>0:
pos-=1
handle.seek(pos, os.SEEK_SET)
if handle.read(1)=='\n': #we are a the end of a new line
# pos+=1 #we go forward one character
# handle.seek(pos,os.SEEK_SET)
if handle.read(len(last_gene)) == last_gene: #we extract the gene id and compare it to ours
break
if pos<0:
sys.stdout.write(last_gene+'not found\n')
handle.close()
return
#we have found the last line displaying our last_gene, we want to truncate after that line
while handle.read(1)!='\n':
pos+=1
handle.seek(pos, os.SEEK_SET)
sys.stdout.write('truncating at position'+str(pos+1)+'\n')
handle.seek(pos+1, os.SEEK_SET)# +1 to conserve the '\n'
handle.truncate()
handle.close()
return