Source code for hylite.Read

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 9 January 2018 #
#===============================#


import itertools

[docs]class Read: ''' Used to represent a read. Attributes: - organism (str): name of the organism the read belongs to - sample (str): name of the sample the Read belong to - gene (str): gene id - start (int): starting position of the read on the gene - stop (int): stopping position of the read on the gene - lsnp (list): references to the snps contained in the read - N (bool): tag for the presence of child specific snps - category (str): give the parents of the reads. ''' def __init__(self, organism, sample, gene, start): '''Used to represent a read. Args: - organism (str): name of the organism - sample (str): name of the sample - gene (str): name of the gene corresponding to the read - start (int): starting position of the read - tag (int): indicate if the read already has a gene copy assigned. -1 for not assigned, corresponding index otherwise ''' self.organism=organism self.sample = sample self.gene=gene self.start=start self.stop=None self.local_snp=[] #list of tuple (snp_index, presence) presence is:-1/0/1. Here -1 is impossible as that would mean that the read don't exist self.N=False #tag for the child specific snps self.category = "UNK" #base category, should only occur when no snp that are not child specific are in the read self.tag = -1 # default is unassigned
[docs] def add_snp(self, index, pre): '''Add a snp to the read Args: - index (int): the index of a snp - pre (int): presence of the snp at this position in the read ''' self.local_snp.append((index, pre))
[docs] def cross(self, al): ''' Cross different fingerprints between themselves Args: - al (dict): dictionary (with organism(s) name(s) as key) of booleans lists ''' o1 = list(al.keys())[0] o2 = list(al.keys())[1] al[o1+'+'+o2] = [al[o1][i] or al[o2][i] for i in range(len(al[o1]))] #creation of the cross al.pop(o1)#removal of the parents al.pop(o2) if len(al) == 1: #if all the cross have been made return al #else do it again return self.cross(al)
[docs] def categorize(self, fprints, lsnp): ''' Compare each fingerprint to its own and assign a category to the read. The finger prints are in fact list of snp index. Args: - fprints (dict): dictionnary (key is organism name) of list of list of tuple (snp index,presence) for each allele of each parent between start and stop - lsnp (list): the list of all snps Note: - Fingerprints don't contain masked snps Category notation: - If the read contain at least one snp belonging only to the child, a tag N is activated - Belonging to any specific parental category is stocked as a string identifier ''' #number of fingerprint is sum(ploidy of parents) #print self.local_snp,fprints #we will now create a simple boolean list for each possible allele and stock it in a dictionnary #The child booleans will be in a simple list Ch = list() parents = dict() #key of this dict will be <organism name>(_<allele number>)if multiple alleles for name, org in list(fprints.items()): #for each parent organism if len(org)==1: #haploid parents[name]=list() continue for nb, allele in enumerate(org): #polyploid parents[name+'_'+str(nb)]=list() #print self.local_snp for i, tup in enumerate(self.local_snp): snp_index = tup[0] presence= tup[1] for k, v in list(lsnp[snp_index].presence.items()):#for each organisme_name, presence (that is a list) if k!=self.organism and presence in v: #At least one parent has the same presence as the child for this snp if presence==1: #the child has this snp Ch.append(True) else: Ch.append(False) for name, org in list(fprints.items()): #for each parent organism #print i, org[0] if len(org)==1:#haploid if org[0][i][1] ==1: parents[name].append(True) else: parents[name].append(False) continue for nb, allele in enumerate(org): #more than one allele for the parent if allele[i][1] ==1: parents[name+'_'+str(nb)].append(True) else: parents[name+'_'+str(nb)].append(False) break #onto the next snp if len(Ch) < len(self.local_snp): #we have removed some new SNPs self.N = True if len(Ch)==0: return #after removing the child specific snps, there is none left: categorization is impossible, category is left UNK #print self.start,self.stop #print self.local_snp,fprints #print Ch,parents #we now have a set of boolean lists that we can compare align = dict() #dictionnary of the alignment of each allele versus the child for name, allele in list(parents.items()): align[name] = [allele[i]==Ch[i] for i in range(len(Ch))] #simple equivalence check for each position match = list()#this is the list containing the parent (or parental association) that have a perfect match with the child #because we removed all child specific position, we expect to have a perfect match for name, al in list(align.items()): if sum(al) == len(Ch): match.append(name) #no need to do the last cross nb_parent = 2 #number of parent of the read while len(match)== 0 and nb_parent<len(align): for comb in itertools.combinations(list(align.items()), nb_parent):#for each combination of alleles: comb = dict((k, v) for (k, v) in comb)#{x[0] : x[1] for x in comb} #we take comb back to a dict cr = self.cross(comb) #we make the cross between parents if sum(cr[list(cr.keys())[0]]) == len(Ch): # and check for perfect matches match.append(list(cr.keys())[0]) #we add the tag of this cross to the perfect matches list nb_parent += 1 if len(match)==0: # We are in the case when all alleles contributed to the read match.append("+".join(list(align.keys()))) #print to make sure: #print self.start, Ch, align #we now have the list of the cross displaying perfect matches and implying the fewer alleles self.category = '('+')|('.join(match)+')' #print self.category return
def __str__(self): return '\t'.join([self.gene, self.organism, self.sample, str(self.start), str(self.stop), self.category, str(self.N)]) def __repr__(self): return self.__str__()