Source code for hylite.Hylite

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 9 January 2018 #
#===============================#


from .Parameters import Parameters

from .Protocol_Reader import Protocol_Reader
from .Snp_Generator import Snp_Generator
from .Output_manager import Output_manager
from .samtools_wrapper import samtools_wrapper
from .bowtie2_wrapper import bowtie2_wrapper

import sys
import os
import pickle


[docs]class Hylite: ''' Main class of HyLiTE. Coordinate all the different analysis and ensure proper output of the results Attributes: - lorg (list): list of organism. THE CHILD ORGANISM IS ALWAYS THE FIRST OF THE LIST - referencefile (str): name of the .fasta file containing the reference sequences/gene model for the analysis - referencebase (str): name of the .bt2 index - lsnp (list): list of SNP objects, modeling the repartition of snps between different organisms - dread (dict):dict (keys are sample name) of list of the child's Read, containing where they align on the reference model, their snp content, and their parental origin - snp_gen (Snp_Generator) - bt2_wrap (bowtie2_wrapper) - samtls_wrap (samtools_wrapper) - out_manager (Output_manager) - picklingfile (str): name of the backup pickle file - name (str): analysis name - outdir (str): name of the directory in which the files will be written - nb_nodes (int): number of nodes allowed for the job - phred64_quality (boolean): boolean set to True if the quality of the readfiles is in phred64 quality - pilefile (str): name of the .pileup file - last_gene (str): id of the last written gene before the last pickle ''' def __init__(self, name, outdir, nb_threads, phred64_quality): ''' Main class of HyLiTE. Coordinate all the different analysis and ensure proper output of the results Args: - name (str): name of the job - outdir (str): name of the directory to write outputs in - nb_threads (int) - phred64_quality (bool) ''' self.name = name self.outdir = outdir self.nb_nodes = nb_threads self.phred64_quality = phred64_quality self.lorg=list() #list of organism. THE CHILD ORGANISM IS ALWAYS THE FIRST OF THE LIST self.referencefile=str() #name of the .fasta file containing the reference sequences/gene model for the analysis self.lsnp = list() #list of SNP objects, modeling the repartition of snps between different organisms self.dread = dict() #dict (keys are sample name) of list of the child's Read, containing where they align on the reference model, their snp content, and their parental origin #snp_gen is generated later: when the pileup file is done self.bt2_wrap = bowtie2_wrapper() self.samtls_wrap = samtools_wrapper() self.out_manager = Output_manager() self.picklingfile = self.outdir +self.name + ".pickle"
[docs] def set_referencefile(self, referencefile): '''Set the reference file in the current HyLiTE instance as well as in the outputmanager. The output manager needs it to know which genes to write in the output files ''' self.referencefile = referencefile self.out_manager.referencefile = referencefile
[docs] def add_organism_list(self, filename, sam): '''Add the list of organism to HyLiTE from a file given as argument Args: - filename (str): the name of the file containing the organism informations - sam (bool): set to True if the protocol file contains in fact the .sam files FORMAT of the file: - without header - separated by '\t' - one line per sample per organism - SAMPLE_TYPE is 'RNAseq' or 'gDNA' (if other, DEFAULT_SAMPLE_TYPE will be put instead) - THE CHILD MUST BE THE FIRST ORGANISM ORGANISM_NAME PLOIDY SAMPLE_NAME SAMPLE_TYPE READ_FILE.fastq ''' protocol_read = Protocol_Reader(filename) self.lorg = protocol_read.read(sam) #important update of the reads for sample in self.lorg[0].sample_name: self.dread[sample] = list() protocol_read.close() return
[docs] def create_index(self): ''' Create the index for the mapping ''' self.referencebase = self.outdir + self.referencefile.rpartition('.')[0].rpartition(os.sep)[2] #name of the base: prefix of the reference .fasta file self.bt2_wrap.build_index(self.referencefile, self.referencebase) return
[docs] def mapping(self): ''' Proceed to the mapping of every organism and every sample ''' for org in self.lorg: #for each organism for sample in org.sample_name: samsuf = '' if sample != "": samsuf = '.'+sample outname = self.outdir + org.name + samsuf + '.sam' # we create the name of the .sam file self.bt2_wrap.mappe_versus(self.referencebase, org.dreadfile[sample], (len(org.dreadfile[sample]) == 2), Parameters().get_param('MISMATCH_DEFAULT'), self.nb_nodes, self.phred64_quality, outname)#mapping operation org.dsamfile[sample] = outname return
[docs] def samtools_pipeline(self): ''' Proceed to the treatment of the different .sam file in order to obtain the .pileup file ''' pilename = self.outdir + self.name + '.pileup' #the name of the pileup file is the name of the analysis llfile = [[org.dsamfile[sample] for sample in org.sample_name] for org in self.lorg] #Contains the list of the list of the .sam file of the organisms self.pilefile = self.samtls_wrap.pipeline(self.referencefile, llfile, self.outdir, self.nb_nodes, pilename) return
[docs] def pileup_init(self): '''This function initialize the Snp_generator ''' self.snp_gen = Snp_Generator(self.pilefile, self.lsnp, self.lorg, self.dread) return
[docs] def pileup_read(self): ''' This function execute the main algorithm of HyLiTE: it will call the SNPs for each position and then assign a category to each of the child's read this function also check the need for pickling ''' last_pickle = 0 #number of analysed genes since the last pickling last_write = 0 #number of analysed genes since the last writing self.last_gene = None lim_pickle = Parameters().get_param('GENE_BETWEEN_PICKLE') lim_write = Parameters().get_param('GENE_BETWEEN_WRITING') for l in self.snp_gen.give_line(): if l.split('\t')[0] != self.snp_gen.current_gene: #new gene last_pickle+=1 last_write+=1 if Parameters().get_param('VERBOSE'): sys.stdout.write(l.split('\t')[0]+'\n') #write the new gene id if last_write >= lim_write: #it's time to write the data last_write = 0 self.output(False) #writing in mode 'a' self.last_gene = self.snp_gen.current_gene #stocking the last written gene self.clear() self.snp_gen.treat_pileup_line(l) if last_pickle >= lim_pickle:#time to pickle last_pickle = 0 self.pickle() self.out_manager.precompute_expression_data_towrite(self.lorg, False) self.out_manager.write_complete_expression_file(self.outdir, self.name) return
[docs] def pickle(self): ''' This function pickle the Hylite instance ''' f=open(self.picklingfile+'.tmp', 'wb') pickle.dump(self, f) f.close() os.rename(self.picklingfile+'.tmp', self.picklingfile) #we replace the old file by the new return
[docs] def restaure(self): ''' This function is used to start the SNP calling again from the last gene completed. It is used after an unpickling of the Hylite instance ''' sys.stdout.write('restauring previous HyLiTE analysis...\n') #before, we must close the pileup file self.snp_gen.close_pileup() #then we must restart it self.snp_gen.restaure() #restauring the results files: self.truncate_gene(self.outdir + self.name + '.snp.txt', self.last_gene) #snp file for sample in self.lorg[0].sample_name: self.truncate_gene(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', self.last_gene)#read files self.truncate_gene(self.outdir + self.name + '.expression.txt', self.last_gene)#expression file for l in self.snp_gen.give_line(): if l.split('\t')[0] == self.snp_gen.current_gene: #we are at the right position #self.snp_gen.treat_pileup_line(l) #this line has alreadey been read ... if Parameters().get_param('VERBOSE'): sys.stdout.write('restarting at gene'+str(l.split('\t')[0])+', position'+str(l.split('\t')[1])+"\n") sys.stdout.write('opened reads:\t'+ str(self.snp_gen.get_nb_reads())+'\n') break # we are ready to move along... self.pileup_read() return
[docs] def output(self, header): ''' Output the different results of HyLiTE Args: - header (bool): True if the header of the file must be written ''' mode = 'a' if header:#if the header is specified, the file are created mode = 'w' #we begin by writing the snps handle = open(self.outdir + self.name + '.snp.txt', mode) self.out_manager.write_snp(handle, self.lsnp, [o.name for o in self.lorg], header) #writing reads #reads are written by sample for sample in self.lorg[0].sample_name: handle = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', mode) self.out_manager.write_read(handle, self.dread[sample], header, self.lorg) #write expression self.out_manager.precompute_expression_data_towrite(self.lorg, header) return
[docs] def output_summary(self): '''This function generate the summaries for the reads and the snps ''' #We begin by the snps: handle_in = open(self.outdir + self.name + '.snp.txt', 'r') handle_out = open(self.outdir + self.name + '.snp.summary.txt', 'w') self.out_manager.write_snp_summary(handle_in, handle_out) #now the reads for sample in self.lorg[0].sample_name: handle_in = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', 'r') handle_out = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.summary.txt', 'w') self.out_manager.write_read_summary(handle_in, handle_out, self.lorg) #output the run's summary handle_out = open(self.outdir + self.name + '.run.summary.txt', 'w') lhandle_in_read = []#list of handles for each sample read.summary.txt for sample in self.lorg[0].sample_name: lhandle_in_read.append(open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.summary.txt', 'r')) handle_in_snp = open(self.outdir + self.name + '.snp.summary.txt', 'r') self.out_manager.write_run_summary(handle_out, lhandle_in_read, handle_in_snp, self.lorg)#write a basic summary of the tun return
[docs] def clear(self): '''This function basically delete all the gene related informations ''' #Reads deletion for s in list(self.dread.keys()):#for each sample self.dread[s] = list() #reinitialisation #SNP deletion while len(self.lsnp)>0: self.lsnp.pop() #Expression data and finger print deletion for org in self.lorg: org.clear() return
[docs] def truncate_gene(self, filename, last_gene): '''This function allow to truncate a result file after a specific gene. It is useful when unpickling an ancient HyLiTE instance. (desynchronisation between the writing of results and pickling) Args: - filename (str): name of the file to truncate - last_gene (str): id of the last gene before the pickling took place ''' handle = open(filename, 'r+') handle.seek(0, os.SEEK_END) #we go at the end of the file pos = handle.tell() -1 handle.seek(pos, os.SEEK_SET) #first we ignore the firsts '\n' and blank lines while handle.read(1)=='\n': pos-=1 handle.seek(pos, os.SEEK_SET) sys.stdout.write('no "\\n"'+str(pos)+'\n') #now we go back to the next line end while pos>0: pos-=1 handle.seek(pos, os.SEEK_SET) if handle.read(1)=='\n': #we are a the end of a new line # pos+=1 #we go forward one character # handle.seek(pos,os.SEEK_SET) if handle.read(len(last_gene)) == last_gene: #we extract the gene id and compare it to ours break if pos<0: sys.stdout.write(last_gene+'not found\n') handle.close() return #we have found the last line displaying our last_gene, we want to truncate after that line while handle.read(1)!='\n': pos+=1 handle.seek(pos, os.SEEK_SET) sys.stdout.write('truncating at position'+str(pos+1)+'\n') handle.seek(pos+1, os.SEEK_SET)# +1 to conserve the '\n' handle.truncate() handle.close() return