Source code for hylite.Hylite

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 9 January 2018 #
#===============================#


from .Parameters import Parameters

from .Protocol_Reader import Protocol_Reader
from .Snp_Generator import Snp_Generator
from .Output_manager import Output_manager
from .samtools_wrapper import samtools_wrapper
from .bowtie2_wrapper import bowtie2_wrapper

import sys
import os
import pickle


[docs]class Hylite:
    '''
    Main class of HyLiTE.
    Coordinate all the different analysis and ensure proper output of the results
    
    Attributes:
          - lorg (list): list of organism. THE CHILD ORGANISM IS ALWAYS THE FIRST OF THE LIST
          - referencefile (str): name of the .fasta file containing the reference sequences/gene model for the analysis
          - referencebase (str): name of the .bt2 index
          - lsnp (list): list of SNP objects, modeling the repartition of snps between different organisms
          - dread (dict):dict (keys are sample name) of list of the child's Read, containing where they align on the reference model, their snp content, and their parental origin
          - snp_gen (Snp_Generator)
          - bt2_wrap (bowtie2_wrapper)
          - samtls_wrap (samtools_wrapper)
          - out_manager (Output_manager)
          - picklingfile (str): name of the backup pickle file
          - name (str): analysis name
          - outdir (str): name of the directory in which the files will be written
          - nb_nodes (int): number of nodes allowed for the job
          - phred64_quality (boolean): boolean set to True if the quality of the readfiles is in phred64 quality
          - pilefile (str): name of the .pileup file
          - last_gene (str): id of the last written gene before the last pickle
    '''
    def __init__(self, name, outdir, nb_threads, phred64_quality):
        '''
        Main class of HyLiTE.
        Coordinate all the different analysis and ensure proper output of the results
        
        Args:
             - name (str): name of the job
             - outdir (str): name of the directory to write outputs in
             - nb_threads (int)
             - phred64_quality (bool)
        '''
        
        self.name = name
        self.outdir = outdir
        
        self.nb_nodes = nb_threads
        self.phred64_quality = phred64_quality
        
        
        self.lorg=list() #list of organism. THE CHILD ORGANISM IS ALWAYS THE FIRST OF THE LIST
        self.referencefile=str() #name of the .fasta file containing the reference sequences/gene model for the analysis
        self.lsnp = list() #list of SNP objects, modeling the repartition of snps between different organisms
        self.dread = dict() #dict (keys are sample name) of list of the child's Read, containing where they align on the reference model, their snp content, and their parental origin
        #snp_gen is generated later: when the pileup file is done
        self.bt2_wrap = bowtie2_wrapper()
        self.samtls_wrap = samtools_wrapper()
        self.out_manager = Output_manager()
        self.picklingfile = self.outdir +self.name + ".pickle"

[docs]    def set_referencefile(self, referencefile):
        '''Set the reference file in the current HyLiTE instance as well as in the outputmanager.
        The output manager needs it to know which genes to write in the output files

        '''
        self.referencefile = referencefile
        self.out_manager.referencefile = referencefile

[docs]    def add_organism_list(self, filename, sam):
        '''Add the list of organism to HyLiTE from a file given as argument
        
        Args:
             - filename (str): the name of the file containing the organism informations
             - sam (bool): set to True if the protocol file contains in fact the .sam files
         
        FORMAT of the file:
         - without header
         - separated by '\t'
         - one line per sample per organism
         - SAMPLE_TYPE is 'RNAseq' or 'gDNA' (if other,  DEFAULT_SAMPLE_TYPE will be put instead)
         - THE CHILD MUST BE THE FIRST ORGANISM
         
        ORGANISM_NAME    PLOIDY    SAMPLE_NAME    SAMPLE_TYPE    READ_FILE.fastq
        '''
        protocol_read = Protocol_Reader(filename)
        self.lorg = protocol_read.read(sam)
        #important update of the reads
        for sample in self.lorg[0].sample_name:
            self.dread[sample] = list()
        protocol_read.close()
        
        return


[docs]    def create_index(self):
        '''
        Create the index for the mapping
        '''
        self.referencebase  = self.outdir + self.referencefile.rpartition('.')[0].rpartition(os.sep)[2] #name of the base: prefix of the reference .fasta file 
        
        self.bt2_wrap.build_index(self.referencefile, self.referencebase)
        
        return
    
[docs]    def mapping(self):
        '''
        Proceed to the mapping of every organism and every sample
        '''
        
        for org in self.lorg: #for each organism
            for sample in org.sample_name:
                samsuf = ''
                if sample != "":
                    samsuf = '.'+sample 
                outname = self.outdir + org.name + samsuf + '.sam' # we create the name of the .sam file
                
                self.bt2_wrap.mappe_versus(self.referencebase, org.dreadfile[sample], (len(org.dreadfile[sample]) == 2), Parameters().get_param('MISMATCH_DEFAULT'), self.nb_nodes, self.phred64_quality, outname)#mapping operation
        
                org.dsamfile[sample] = outname
        
        return
    
[docs]    def samtools_pipeline(self):
        '''
        Proceed to the treatment of the different .sam file in order to obtain the .pileup file
        '''
        pilename = self.outdir + self.name + '.pileup' #the name of the pileup file is the name of the analysis
        
        llfile = [[org.dsamfile[sample] for sample in org.sample_name] for org in self.lorg] #Contains the list of the list of the .sam file of the organisms
        self.pilefile = self.samtls_wrap.pipeline(self.referencefile, llfile, self.outdir, self.nb_nodes, pilename)
    
        return
    
[docs]    def pileup_init(self):
        '''This function initialize the Snp_generator
        
        '''
        self.snp_gen = Snp_Generator(self.pilefile, self.lsnp, self.lorg, self.dread)
        
        return
        
[docs]    def pileup_read(self):
        '''
        This function execute the main algorithm of HyLiTE: it will call the SNPs for each position and then assign a category to each of the child's read
        this function also check the need for pickling
        '''
        last_pickle = 0 #number of analysed genes since the last pickling
        last_write = 0 #number of analysed genes since the last writing
        
        self.last_gene = None
        
        lim_pickle = Parameters().get_param('GENE_BETWEEN_PICKLE')
        lim_write = Parameters().get_param('GENE_BETWEEN_WRITING')
        
        for l in self.snp_gen.give_line():
            if l.split('\t')[0] != self.snp_gen.current_gene: #new gene
                last_pickle+=1
                last_write+=1
                if Parameters().get_param('VERBOSE'):
                    sys.stdout.write(l.split('\t')[0]+'\n') #write the new gene id
                if last_write >= lim_write: #it's time to write the data
                    
                    last_write = 0
                    
                    self.output(False) #writing in mode 'a'
                    self.last_gene = self.snp_gen.current_gene #stocking the last written gene
                    self.clear()
            
            self.snp_gen.treat_pileup_line(l)
            
            if last_pickle >= lim_pickle:#time to pickle
                last_pickle = 0
                self.pickle()
        self.out_manager.precompute_expression_data_towrite(self.lorg, False)
        self.out_manager.write_complete_expression_file(self.outdir, self.name)
        return
        
[docs]    def pickle(self):
        '''
        This function pickle the Hylite instance
        '''
        
        f=open(self.picklingfile+'.tmp', 'wb')
        pickle.dump(self, f)
        f.close()
        
        os.rename(self.picklingfile+'.tmp', self.picklingfile) #we replace the old file by the new
        
        return
    
[docs]    def restaure(self):
        '''
        This function is used to start the SNP calling again from the last gene completed.
        It is used after an unpickling of the Hylite instance
        '''
        
        sys.stdout.write('restauring previous HyLiTE analysis...\n')
        #before, we must close the pileup file
        self.snp_gen.close_pileup()
        #then we must restart it
        self.snp_gen.restaure()

        #restauring the results files:
        self.truncate_gene(self.outdir + self.name + '.snp.txt', self.last_gene) #snp file
        for sample in self.lorg[0].sample_name:
            self.truncate_gene(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', self.last_gene)#read files
        self.truncate_gene(self.outdir + self.name + '.expression.txt', self.last_gene)#expression file
        
        for l in self.snp_gen.give_line():
            if l.split('\t')[0] == self.snp_gen.current_gene: #we are at the right position
                #self.snp_gen.treat_pileup_line(l) #this line has alreadey been read ...
                if Parameters().get_param('VERBOSE'):
                    sys.stdout.write('restarting at gene'+str(l.split('\t')[0])+', position'+str(l.split('\t')[1])+"\n")
                    sys.stdout.write('opened reads:\t'+ str(self.snp_gen.get_nb_reads())+'\n')
                    break
#         we are ready to move along...
        
        self.pileup_read()

        return
[docs]    def output(self, header):
        '''
        Output the different results of HyLiTE

        Args:
             - header (bool): True if the header of the file must be written
        '''
        mode = 'a'
        if header:#if the header is specified, the file are created
            mode = 'w'
        #we begin by writing the snps
        handle = open(self.outdir + self.name + '.snp.txt', mode)
        self.out_manager.write_snp(handle, self.lsnp, [o.name for o in self.lorg], header)
        
        #writing reads
        #reads are written by sample
        for sample in self.lorg[0].sample_name:
            handle = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', mode)
            self.out_manager.write_read(handle, self.dread[sample], header, self.lorg)
            
        #write expression
        self.out_manager.precompute_expression_data_towrite(self.lorg, header)
        return
    
[docs]    def output_summary(self):
        '''This function generate the summaries for the reads and the snps
         
        '''
        #We begin by the snps:
        handle_in = open(self.outdir + self.name + '.snp.txt', 'r')
        handle_out = open(self.outdir + self.name + '.snp.summary.txt', 'w')
        self.out_manager.write_snp_summary(handle_in, handle_out)
    
        #now the reads
        for sample in self.lorg[0].sample_name:
            handle_in = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.txt', 'r')
            handle_out = open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.summary.txt', 'w')
            self.out_manager.write_read_summary(handle_in, handle_out, self.lorg)


        #output the run's summary
        handle_out = open(self.outdir + self.name + '.run.summary.txt', 'w')
        lhandle_in_read = []#list of handles for each sample read.summary.txt
        for sample in self.lorg[0].sample_name:
            lhandle_in_read.append(open(self.outdir + self.name +'.'+ self.lorg[0].name+'.'+ sample +'.read.summary.txt', 'r'))
        handle_in_snp = open(self.outdir + self.name + '.snp.summary.txt', 'r')

        self.out_manager.write_run_summary(handle_out, lhandle_in_read, handle_in_snp, self.lorg)#write a basic summary of the tun

        return

[docs]    def clear(self):
        '''This function basically delete all the gene related informations
        '''
        #Reads deletion
        for s in list(self.dread.keys()):#for each sample
            self.dread[s] = list() #reinitialisation
        #SNP deletion
        while len(self.lsnp)>0:
            self.lsnp.pop()
        
        #Expression data and finger print deletion
        for org in self.lorg:
            org.clear()
        return
      
[docs]    def truncate_gene(self, filename, last_gene):
        '''This function allow to truncate a result file after a specific gene.
        It is useful when unpickling an ancient HyLiTE instance. (desynchronisation between the writing of results and pickling)

        Args:
             - filename (str): name of the file to truncate
             - last_gene (str): id of the last gene before the pickling took place
        
        '''
        handle = open(filename, 'r+')
    
        handle.seek(0, os.SEEK_END) #we go at the end of the file
        pos = handle.tell() -1
    
        handle.seek(pos, os.SEEK_SET)
        #first we ignore the firsts '\n' and blank lines
        while handle.read(1)=='\n':
            pos-=1
            handle.seek(pos, os.SEEK_SET)
        sys.stdout.write('no "\\n"'+str(pos)+'\n')
        
        #now we go back to the next line end
        while pos>0:
            pos-=1
            handle.seek(pos, os.SEEK_SET)
            if handle.read(1)=='\n': #we are a the end of a new line
    #            pos+=1 #we go forward one character
    #            handle.seek(pos,os.SEEK_SET)
                if handle.read(len(last_gene)) == last_gene: #we extract the gene id and compare it to ours
                    break
        if pos<0:
            sys.stdout.write(last_gene+'not found\n')
            handle.close()
            return
    
        #we have found the last line displaying our last_gene, we want to truncate after that line
        while handle.read(1)!='\n':
            pos+=1
            handle.seek(pos, os.SEEK_SET)
    
        sys.stdout.write('truncating at position'+str(pos+1)+'\n')
        handle.seek(pos+1, os.SEEK_SET)# +1 to conserve the '\n'
        handle.truncate()    
        handle.close()
  
        return