# Source code for hylite.Organism

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 9 January 2018 #
#===============================#


from .Fingerprint import Fingerprint

class Organism:
    '''
    This class represents an organism (parent or child) in our analysis.

    Its attributes are:
     - name: str, name of the organism
     - ploidy: int, ploidy of the organism
     - dreadfile: dict, keys are sample names, values designate the file(s) containing the reads of the sample
     - sample_name: list, list of the sample names
     - sample_type: dict, keys are sample names and values are 'RNAseq' or 'gDNA', corresponding to the nature of the sample
     - dsamfile: dict, keys are sample names, values start as None (presumably set later to the .sam file of the sample)
     - dbamfile: dict, keys are sample names, values start as None (presumably set later to the .bam file of the sample)
     - dsortedbamfile: dict, keys are sample names, values start as None (presumably set later to the sorted .bam file of the sample)
     - dexpression: dict, keys are sample names, values are dicts where keys are gene labels and values are the number of reads for the gene
     - fingerprint: Fingerprint, the single fingerprint of the organism (built from its ploidy)
    '''

    def __init__(self, name, ploidy, dreadfile, sample_name, sample_type):
        '''Create an organism to be used during a HyLiTE analysis.

        Args:
             - name (str): the name of the organism
             - ploidy (int): the ploidy of the organism
             - dreadfile (dict): a dictionary of the files containing reads from the organism (key is the sample name)
             - sample_name (list): a list of the sample names (the order of the names is relevant)
             - sample_type (dict): a dictionary where keys are sample names and values are 'RNAseq' or 'gDNA', corresponding to the nature of the sample

        '''
        self.name = name
        self.ploidy = ploidy
        self.dreadfile = dreadfile
        self.sample_name = sample_name
        self.sample_type = sample_type

        # one slot per sample for each kind of alignment file; nothing is known yet
        self.dsamfile = {sample: None for sample in self.sample_name}
        self.dbamfile = {sample: None for sample in self.sample_name}
        self.dsortedbamfile = {sample: None for sample in self.sample_name}

        self._reset_gene_data()

    def _reset_gene_data(self):
        '''(Re)create the empty gene related containers: expression counts and fingerprint.'''
        # dict of dict: first key is sample name, second key is gene name,
        # value is the count of said gene for said sample
        self.dexpression = {sample: dict() for sample in self.sample_name}
        # only one fingerprint by organism
        self.fingerprint = Fingerprint(self.ploidy)

    def __str__(self):
        return "Organism: %s" % self.name

    def __repr__(self):
        return self.__str__()

    def add_expression(self, sample, gene, exp):
        '''Increment the count of a given gene from a given sample.

        Args:
             - sample (str): a sample name (must be one of the samples of the organism)
             - gene (str): a gene name
             - exp (int): the number of counts to add

        Raises:
             - KeyError: if sample is not a key of dexpression
        '''
        counts = self.dexpression[sample]
        # a gene seen for the first time starts from 0
        counts[gene] = counts.get(gene, 0) + exp

    def add_snp(self, gene, position, snp_index, presence, polyploid_args=None):
        '''Register a snp in the fingerprint of the organism, once per gene copy.

        Args:
             - gene (str): a gene name
             - position (int): the position of the snp on the gene
             - snp_index (int): index of the snp in the hylite list
             - presence (list): list of int; gives the snp presence (-1/0/1) in each gene copy (at least ploidy elements are read)
        Kwargs:
             - polyploid_args: optional argument for polyploids; currently unused, kept for interface compatibility
        '''
        for allele in range(self.ploidy):
            self.fingerprint.add_snp(gene, position, snp_index, presence[allele], allele)

    def get_fingerprint(self, gene, start, stop, allele):
        '''Ask the fingerprint of the organism for one allele on a region of a gene.

        Args:
             - gene (str): a gene name
             - start (int): start position on the gene
             - stop (int): stop position on the gene
             - allele (int): index of the allele (gene copy)

        Returns:
             - list. a list of the list of the snp index that are found between start and stop on gene in one allele of the organism
        '''
        return self.fingerprint.get_fingerprint(gene, start, stop, allele)

    def get_all_fingerprint(self, gene, start, stop):
        '''Collect the fingerprints of every allele on a region of a gene.

        Args:
             - gene (str): a gene
             - start (int): start position on the gene
             - stop (int): stop position on the gene

        Returns:
             - list. the list of all the fingerprints of the organism between start and stop (one per allele); if they are all equal, a list containing that single fingerprint
        '''
        fps = [self.get_fingerprint(gene, start, stop, allele)
               for allele in range(self.ploidy)]

        if any(fp != fps[0] for fp in fps[1:]):
            return fps
        # if all fingerprints are the same, return only one fingerprint
        return [fps[0]]

    def clear(self):
        '''
        This function clears all the gene related data of an organism
        (expression counts and fingerprint are replaced by fresh, empty ones).
        '''
        self._reset_gene_data()