#!/usr/bin/env python3
# (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
# This file is part of HyLiTE.
#
# HyLiTE is a free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
# HyLiTE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HyLiTE. If not, see <http://www.gnu.org/licenses/>
#===============================#
# author: Wandrille Duchemin #
# Murray Cox #
# last modified: 9 January 2018 #
#===============================#
import itertools
[docs]class Read:
'''
Used to represent a read.
Attributes:
- organism (str): name of the organism the read belongs to
- sample (str): name of the sample the Read belong to
- gene (str): gene id
- start (int): starting position of the read on the gene
- stop (int): stopping position of the read on the gene
- lsnp (list): references to the snps contained in the read
- N (bool): tag for the presence of child specific snps
- category (str): give the parents of the reads.
'''
def __init__(self, organism, sample, gene, start):
'''Used to represent a read.
Args:
- organism (str): name of the organism
- sample (str): name of the sample
- gene (str): name of the gene corresponding to the read
- start (int): starting position of the read
- tag (int): indicate if the read already has a gene copy assigned. -1 for not assigned, corresponding index otherwise
'''
self.organism=organism
self.sample = sample
self.gene=gene
self.start=start
self.stop=None
self.local_snp=[] #list of tuple (snp_index, presence) presence is:-1/0/1. Here -1 is impossible as that would mean that the read don't exist
self.N=False #tag for the child specific snps
self.category = "UNK" #base category, should only occur when no snp that are not child specific are in the read
self.tag = -1 # default is unassigned
[docs] def add_snp(self, index, pre):
'''Add a snp to the read
Args:
- index (int): the index of a snp
- pre (int): presence of the snp at this position in the read
'''
self.local_snp.append((index, pre))
[docs] def cross(self, al):
'''
Cross different fingerprints between themselves
Args:
- al (dict): dictionary (with organism(s) name(s) as key) of booleans lists
'''
o1 = list(al.keys())[0]
o2 = list(al.keys())[1]
al[o1+'+'+o2] = [al[o1][i] or al[o2][i] for i in range(len(al[o1]))] #creation of the cross
al.pop(o1)#removal of the parents
al.pop(o2)
if len(al) == 1: #if all the cross have been made
return al
#else do it again
return self.cross(al)
[docs] def categorize(self, fprints, lsnp):
'''
Compare each fingerprint to its own and assign a category to the read.
The finger prints are in fact list of snp index.
Args:
- fprints (dict): dictionnary (key is organism name) of list of list of tuple (snp index,presence) for each allele of each parent between start and stop
- lsnp (list): the list of all snps
Note:
- Fingerprints don't contain masked snps
Category notation:
- If the read contain at least one snp belonging only to the child, a tag N is activated
- Belonging to any specific parental category is stocked as a string identifier
'''
#number of fingerprint is sum(ploidy of parents)
#print self.local_snp,fprints
#we will now create a simple boolean list for each possible allele and stock it in a dictionnary
#The child booleans will be in a simple list
Ch = list()
parents = dict() #key of this dict will be <organism name>(_<allele number>)if multiple alleles
for name, org in list(fprints.items()): #for each parent organism
if len(org)==1: #haploid
parents[name]=list()
continue
for nb, allele in enumerate(org): #polyploid
parents[name+'_'+str(nb)]=list()
#print self.local_snp
for i, tup in enumerate(self.local_snp):
snp_index = tup[0]
presence= tup[1]
for k, v in list(lsnp[snp_index].presence.items()):#for each organisme_name, presence (that is a list)
if k!=self.organism and presence in v: #At least one parent has the same presence as the child for this snp
if presence==1: #the child has this snp
Ch.append(True)
else:
Ch.append(False)
for name, org in list(fprints.items()): #for each parent organism
#print i, org[0]
if len(org)==1:#haploid
if org[0][i][1] ==1:
parents[name].append(True)
else:
parents[name].append(False)
continue
for nb, allele in enumerate(org): #more than one allele for the parent
if allele[i][1] ==1:
parents[name+'_'+str(nb)].append(True)
else:
parents[name+'_'+str(nb)].append(False)
break #onto the next snp
if len(Ch) < len(self.local_snp): #we have removed some new SNPs
self.N = True
if len(Ch)==0:
return #after removing the child specific snps, there is none left: categorization is impossible, category is left UNK
#print self.start,self.stop
#print self.local_snp,fprints
#print Ch,parents
#we now have a set of boolean lists that we can compare
align = dict() #dictionnary of the alignment of each allele versus the child
for name, allele in list(parents.items()):
align[name] = [allele[i]==Ch[i] for i in range(len(Ch))] #simple equivalence check for each position
match = list()#this is the list containing the parent (or parental association) that have a perfect match with the child
#because we removed all child specific position, we expect to have a perfect match
for name, al in list(align.items()):
if sum(al) == len(Ch):
match.append(name)
#no need to do the last cross
nb_parent = 2 #number of parent of the read
while len(match)== 0 and nb_parent<len(align):
for comb in itertools.combinations(list(align.items()), nb_parent):#for each combination of alleles:
comb = dict((k, v) for (k, v) in comb)#{x[0] : x[1] for x in comb} #we take comb back to a dict
cr = self.cross(comb) #we make the cross between parents
if sum(cr[list(cr.keys())[0]]) == len(Ch): # and check for perfect matches
match.append(list(cr.keys())[0]) #we add the tag of this cross to the perfect matches list
nb_parent += 1
if len(match)==0:
# We are in the case when all alleles contributed to the read
match.append("+".join(list(align.keys())))
#print to make sure:
#print self.start, Ch, align
#we now have the list of the cross displaying perfect matches and implying the fewer alleles
self.category = '('+')|('.join(match)+')'
#print self.category
return
def __str__(self):
return '\t'.join([self.gene,
self.organism,
self.sample,
str(self.start),
str(self.stop),
self.category,
str(self.N)])
def __repr__(self):
return self.__str__()