Source code for hylite.Protocol_Reader

#!/usr/bin/env python3

#    (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
#    This file is part of HyLiTE.
#
#    HyLiTE is a free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License version 3 as published by
#    the Free Software Foundation.
#
#    HyLiTE is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with HyLiTE.  If not, see <http://www.gnu.org/licenses/>


#===============================#
# author: Wandrille Duchemin    #
#         Murray Cox            #
# last modified: 20 April 2018  #
#===============================#


import os, sys
from .Parameters import Parameters
from .Organism import Organism

class Protocol_Reader:
    '''
    Class designed to read a file containing information about the
    organisms, samples and files of the HyLiTE analysis.

    The reader can be closed explicitly with close() or used as a context
    manager (``with Protocol_Reader(path) as reader: ...``).

    Attributes:
        - protocolfile (str): name of the file containing the protocol
        - handle (file object): reading handle of the protocol file
    '''

    def __init__(self, filename):
        '''
        Open the protocol file for reading.

        Args:
            - filename (str): name of the file containing the protocol
              information

        Raises:
            - IOError/OSError: if the file cannot be opened
        '''
        self.protocolfile = filename
        self.handle = open(self.protocolfile, 'r')

    def __enter__(self):
        ''' Return the reader itself so it can be used in a with statement '''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        ''' Close the handle when leaving the with block; never swallow errors '''
        self.close()
        return False

    def close(self):
        '''
        Close the handle
        '''
        self.handle.close()

    def read(self, sam):
        '''
        Read the protocol file

        Args:
            - sam (bool): a boolean set to True if the protocol file contains
              .sam file and not reads file

        Returns:
            - list. the list of the organisms included in the HyLiTE
              analysis, in order of first appearance in the file

        Raises:
            - IOError: if a file named in the fifth column does not exist
            - SystemExit: (exit status 1) if a non-blank line does not have
              exactly five tab-delimited columns
            - ValueError: if the ploidy column is not an integer

        FORMAT of the file:
            - without header
            - separated by '\\t'
            - one line per sample per organism
            - SAMPLE_TYPE is 'RNAseq' or 'gDNA' (if other,
              DEFAULT_SAMPLE_TYPE will be put instead)
            - THE CHILD MUST BE THE FIRST ORGANISM
            - blank (whitespace-only) lines are ignored

            ORGANISM_NAME  PLOIDY  SAMPLE_NAME  SAMPLE_TYPE  READ_FILE.fastq/ALIGNMENT_FILE.sam
        '''
        lorgname = []      # organism names, in order of first appearance
        dploidy = {}       # key: organism name ; value: ploidy
        ddsample = {}      # key: organism name ; value: dict: key: sample name ; value: sample type
        ddreadfile = {}    # key: organism name ; value: dict: key: sample name ; value: list of read files
        ddsamfile = {}     # key: organism name ; value: dict: key: sample name ; value: .sam file (sam mode only)

        for l in self.handle:
            # Lines read from a file always carry their newline, so test the
            # stripped content: blank lines (e.g. a trailing one) are skipped
            # instead of being reported as malformed.
            if not l.strip():
                continue

            ll = l.strip('\n').split('\t')
            if len(ll) != 5:
                print("Cannot parse protocol line '{}'\n Line should contain five tab-delimited columns".format(l))
                sys.exit(1)

            orgname, ploidy, samplename, sampletype, current_file = ll

            if orgname not in dploidy:
                # first time we see this organism
                lorgname.append(orgname)
                dploidy[orgname] = int(ploidy)
                ddsample[orgname] = {}
                ddreadfile[orgname] = {}
                if sam:
                    ddsamfile[orgname] = {}

            if samplename not in ddsample[orgname]:
                # first time we see this sample name for this organism
                if sampletype in Parameters().get_param('SAMPLE_TYPE'):
                    ddsample[orgname][samplename] = sampletype
                else:
                    # unknown sample type: fall back on the configured default
                    ddsample[orgname][samplename] = Parameters().get_param('DEFAULT_SAMPLE_TYPE')
                ddreadfile[orgname][samplename] = []

            # verification of the files
            if not os.path.isfile(current_file):
                raise IOError("%s couldn't be found\n" % current_file)

            # an unexpected extension is only worth a warning, not an error
            if sam and not current_file.endswith(".sam"):
                sys.stderr.write("WARNING: Sam files option turned on in the command line, but file extension is: %s\n" % os.path.splitext(current_file)[-1])
            if not sam and not current_file.endswith(".fastq"):
                sys.stderr.write("WARNING: Fastq file expected, but file extension is: %s\n" % os.path.splitext(current_file)[-1])

            if not sam:
                # we add the name of the .fastq file
                ddreadfile[orgname][samplename].append(current_file)
            else:
                # the alignment already exists: remember it and keep a
                # placeholder entry in the read-file list
                ddsamfile[orgname][samplename] = current_file
                ddreadfile[orgname][samplename].append(None)

        # we can now create our Organism instances
        lorg = []
        for name in lorgname:
            organism = Organism(name,
                                dploidy[name],
                                ddreadfile[name],
                                list(ddsample[name].keys()),
                                ddsample[name])
            if sam:
                # we already have the .sam files
                organism.dsamfile = ddsamfile[name]
            lorg.append(organism)

        return lorg