Source code for hylite.Protocol_Reader

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 20 April 2018  #
#===============================#


import os, sys
from .Parameters import Parameters
from .Organism import Organism

class Protocol_Reader:
    '''
    Class designed to read a file containing information about the organisms, samples and files of the HyLiTE analysis

    The reader opens the protocol file on construction and keeps the handle
    open until close() is called. It can also be used as a context manager,
    in which case the handle is closed when the 'with' block exits.

    Attributes:
         - protocolfile (str): name of the file containing the protocol
         - handle (file object): reading handle of the protocol file
    '''
    def __init__(self, filename):
        '''
        Open the protocol file for reading.

        Args:
             - filename (str): name of the file containing the protocol informations
        '''
        self.protocolfile = filename
        self.handle = open(self.protocolfile, 'r')

    def __enter__(self):
        '''
        Enter a 'with' block.

        Returns:
             - Protocol_Reader. the reader itself
        '''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''
        Leave a 'with' block: close the handle. Exceptions are not suppressed.
        '''
        self.close()

    def close(self):
        '''
        Close the handle
        '''
        self.handle.close()

    def read(self, sam):
        '''
        Read the protocol file

        Args:
             - sam (bool): a boolean set to True if the protocol file contains .sam file and not reads file

        Returns:
             - list. the list of the organisms included in the HyLiTE analysis,
               in the order in which they first appear in the protocol file

        Raises:
             - SystemExit: (exit code 1) if a non-blank line does not contain five tab-delimited columns
             - ValueError: if the PLOIDY column of a new organism is not an integer
             - IOError: if a file listed in the fifth column does not exist

        FORMAT of the file:
         - without header
         - separated by tabulations
         - one line per sample per organism
         - blank lines are ignored
         - SAMPLE_TYPE is 'RNAseq' or 'gDNA' (if other,  DEFAULT_SAMPLE_TYPE will be put instead)
         - THE CHILD MUST BE THE FIRST ORGANISM

        ORGANISM_NAME    PLOIDY    SAMPLE_NAME    SAMPLE_TYPE    READ_FILE.fastq/ALIGNMENT_FILE.sam
        '''
        lorgname = list()  # organism names, in order of first appearance
        dploidy = dict()  # key: organism name ; value: ploidy
        ddsample = dict()  # key: organism name ; value: dict: key: sample name ; value: sample type
        ddreadfile = dict()  # key: organism name ; value: dict: key: sample name ; value: list of read files associated with this sample
        # key: organism name ; value: dict: key: sample name ; value: .sam file.
        # Always created (and only filled when sam is truthy) so that every
        # later "if sam" test agrees with this one, whatever truthy value is passed.
        ddsamfile = dict()

        for line in self.handle:  # for each line
            # A line read from a file is never the empty string and
            # ''.split('\t') yields [''], so blank lines have to be detected
            # on the stripped text; otherwise a trailing empty line would be
            # reported as a malformed record.
            if not line.strip():
                continue

            fields = line.strip('\n').split('\t')
            if len(fields) != 5:
                print( "Cannot parse protocol line '{}'\n Line should contain five tab-delimited columns".format(line))
                sys.exit(1)

            org_name, ploidy, sample_name, sample_type, current_file = fields

            if org_name not in dploidy:  # first time we see this organism
                lorgname.append(org_name)
                dploidy[org_name] = int(ploidy)
                ddsample[org_name] = dict()
                ddreadfile[org_name] = dict()
                if sam:
                    ddsamfile[org_name] = dict()

            if sample_name not in ddsample[org_name]:  # first time we see this sample name for this organism
                if sample_type in Parameters().get_param('SAMPLE_TYPE'):
                    ddsample[org_name][sample_name] = sample_type  # we add the type of the sample
                else:
                    ddsample[org_name][sample_name] = Parameters().get_param('DEFAULT_SAMPLE_TYPE')
                ddreadfile[org_name][sample_name] = list()

            # verification of the files
            if not os.path.isfile(current_file):
                raise IOError("%s couldn't be found\n" % current_file)
            # an unexpected extension is only reported, the file is still used
            if sam and not current_file.endswith(".sam"):
                sys.stderr.write("WARNING: Sam files option turned on in the command line, but file extension is: %s\n" % os.path.splitext(current_file)[-1])
            if not sam and not current_file.endswith(".fastq"):
                sys.stderr.write("WARNING: Fastq file expected, but file extension is: %s\n" % os.path.splitext(current_file)[-1])

            if not sam:
                ddreadfile[org_name][sample_name].append(current_file)  # we add the name of the .fastq file
            else:
                # the alignment already exists: remember the .sam file and
                # put a None placeholder in the list of read files
                ddsamfile[org_name][sample_name] = current_file
                ddreadfile[org_name][sample_name].append(None)

        # we can now create our Organisms instances
        lorg = list()
        for name in lorgname:  # for each organism
            organism = Organism(name, dploidy[name], ddreadfile[name], list(ddsample[name].keys()), ddsample[name])
            if sam:  # we already have the .sam files
                organism.dsamfile = ddsamfile[name]
            lorg.append(organism)
        return lorg