Source code for hylite.Read

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 9 January 2018 #
#===============================#


import itertools

[docs]class Read:
    '''
    Used to represent a read.

    Attributes:
        - organism (str): name of the organism the read belongs to
        - sample (str): name of the sample the Read belong to
        - gene (str): gene id
        - start (int): starting position of the read on the gene
        - stop (int): stopping position of the read on the gene
        - lsnp (list): references to the snps contained in the read
        - N (bool): tag for the presence of child specific snps
        - category (str): give the parents of the reads.
    '''
    
    def __init__(self, organism, sample, gene, start):
        '''Used to represent a read.

        Args:
             - organism (str): name of the organism
             - sample (str): name of the sample
             - gene (str): name of the gene corresponding to the read
             - start (int): starting position of the read
             - tag (int): indicate if the read already has a gene copy assigned. -1 for not assigned, corresponding index otherwise
        '''
        
        self.organism=organism
        self.sample = sample
        self.gene=gene
        self.start=start
        self.stop=None
        
        self.local_snp=[] #list of tuple (snp_index, presence) presence is:-1/0/1. Here -1 is impossible as that would mean that the read don't exist
        
        self.N=False #tag for the child specific snps
        self.category = "UNK" #base category, should only occur when no snp that are not child specific are in the read 

        self.tag = -1 # default is unassigned
        
[docs]    def add_snp(self, index, pre):
        '''Add a snp to the read
        
        Args:
             - index (int): the index of a snp
             - pre (int): presence of the snp at this position in the read
        '''
        self.local_snp.append((index, pre))

[docs]    def cross(self, al):
        '''
        Cross different fingerprints between themselves 
        
        Args:
             - al (dict): dictionary (with organism(s) name(s) as key) of booleans lists
            
        '''
        o1 = list(al.keys())[0]
        o2 = list(al.keys())[1]
        
        al[o1+'+'+o2] = [al[o1][i] or al[o2][i] for i in range(len(al[o1]))] #creation of the cross
        al.pop(o1)#removal of the parents
        al.pop(o2)
        
        if len(al) == 1: #if all the cross have been made
            return al
        #else do it again
        return self.cross(al) 


        
[docs]    def categorize(self, fprints, lsnp):
        '''
        Compare each fingerprint to its own and assign a category to the read.
        The finger prints are in fact list of snp index.
        
        Args:
             - fprints (dict): dictionnary (key is organism name) of list of list of tuple (snp index,presence) for each allele of each parent between start and stop
             - lsnp (list): the list of all snps
             
        
        Note:
             - Fingerprints don't contain masked snps
        
        Category notation:
             - If the read contain at least one snp belonging only to the child, a tag N is activated
             - Belonging to any specific parental category is stocked as a string identifier
            
        '''
        #number of fingerprint is sum(ploidy of parents)
        #print self.local_snp,fprints
        
        #we will now create a simple boolean list for each possible allele and stock it in a dictionnary
        #The child booleans will be in a simple list

        Ch = list()
        parents = dict() #key of this dict will be <organism name>(_<allele number>)if multiple alleles
        for name, org in list(fprints.items()): #for each parent organism
            if len(org)==1: #haploid
                parents[name]=list()
                
                continue
            
            for nb, allele in enumerate(org): #polyploid
                parents[name+'_'+str(nb)]=list()


        #print self.local_snp
        for i, tup in enumerate(self.local_snp):
            snp_index = tup[0]
            presence= tup[1]
            for k, v in list(lsnp[snp_index].presence.items()):#for each organisme_name, presence (that is a list)
                if k!=self.organism and presence in v: #At least one parent has the same presence as the child for this snp
                    
                    if presence==1: #the child has this snp
                        Ch.append(True)
                    else:
                        Ch.append(False)
                    
                    for name, org in list(fprints.items()): #for each parent organism
                        #print i, org[0]
                        if len(org)==1:#haploid
                            if org[0][i][1] ==1:
                                parents[name].append(True)
                            else:
                                parents[name].append(False)
                            continue 
                        for nb, allele in enumerate(org): #more than one allele for the parent
                            if allele[i][1] ==1:
                                parents[name+'_'+str(nb)].append(True)
                            else:
                                parents[name+'_'+str(nb)].append(False)                            
                    break #onto the next snp
        
        if len(Ch) < len(self.local_snp): #we have removed some new SNPs
            self.N = True
        if len(Ch)==0:
            return #after removing the child specific snps, there is none left: categorization is impossible, category is left UNK
        
    #print self.start,self.stop
        #print self.local_snp,fprints
        #print Ch,parents        
        #we now have a set of boolean lists that we can compare
        
        align = dict() #dictionnary of the alignment of each allele versus the child
        
        for name, allele in list(parents.items()):
            align[name] = [allele[i]==Ch[i] for i in range(len(Ch))] #simple equivalence check for each position
        
        match = list()#this is the list containing the parent (or parental association) that have a perfect match with the child
        #because we removed all child specific position, we expect to have a perfect match
        
        for name, al in list(align.items()):
            if sum(al) == len(Ch):
                match.append(name)
                

        #no need to do the last cross
        nb_parent = 2 #number of parent of the read
        
        while len(match)== 0 and nb_parent<len(align):
            for comb in itertools.combinations(list(align.items()), nb_parent):#for each combination of alleles:
                comb = dict((k, v) for (k, v) in comb)#{x[0] : x[1] for x in comb} #we take comb back to a dict
                cr = self.cross(comb) #we make the cross between parents
                if sum(cr[list(cr.keys())[0]]) == len(Ch): # and check for perfect matches
                    match.append(list(cr.keys())[0]) #we add the tag of this cross to the perfect matches list
            nb_parent += 1
        
        if len(match)==0:
            # We are in the case when all alleles contributed to the read
            match.append("+".join(list(align.keys())))
            
            #print to make sure:
            #print self.start, Ch, align
        
        #we now have the list of the cross displaying perfect matches and implying the fewer alleles
        self.category =  '('+')|('.join(match)+')'
        
    #print self.category

        return
    
    def __str__(self):
        return '\t'.join([self.gene,
                            self.organism,
                            self.sample,
                            str(self.start),
                            str(self.stop),
                            self.category,
                            str(self.N)])
    
    def __repr__(self):
        return self.__str__()