Source code for hylite.Protocol_Reader
#!/usr/bin/env python3
# (c) Copyright 2013-2018 Murray Cox, Wandrille Duchemin, Pierre-Yves Dupont.
#
#
# This file is part of HyLiTE.
#
# HyLiTE is a free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
# HyLiTE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HyLiTE. If not, see <http://www.gnu.org/licenses/>
#===============================#
# author: Wandrille Duchemin #
# Murray Cox #
# last modified: 20 April 2018 #
#===============================#
import os, sys
from .Parameters import Parameters
from .Organism import Organism
class Protocol_Reader:
    '''
    Class designed to read a file containing information about the organisms, samples and files of the HyLiTE analysis

    The reader opens the protocol file on construction. It can be used as a
    context manager, in which case the handle is closed when the block exits.

    Attributes:
        - protocolfile (str): name of the file containing the protocol
        - handle (file object): reading handle of the protocol file
    '''

    def __init__(self, filename):
        '''
        Args:
            - filename (str): name of the file containing the protocol informations
        '''
        self.protocolfile = filename
        self.handle = open(self.protocolfile, 'r')

    def __enter__(self):
        '''
        Enter a ``with`` block

        Returns:
            - Protocol_Reader. the reader itself
        '''
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''
        Close the handle when leaving a ``with`` block; exceptions are not suppressed
        '''
        self.close()
        return False

    def close(self):
        '''
        Close the handle
        '''
        self.handle.close()

    def read(self, sam):
        '''
        Read the protocol file

        Args:
            - sam (bool): a boolean set to True if the protocol file contains .sam file and not reads file

        Returns:
            - list. the list of the organisms included in the HyLiTE analysis

        Raises:
            - IOError: if a file named in the fifth column does not exist
            - SystemExit: (exit code 1) if a non-blank line does not have five columns

        FORMAT of the file:
            - without header
            - separated by tabulations
            - one line per sample per organism
            - blank lines are ignored
            - SAMPLE_TYPE is 'RNAseq' or 'gDNA' (if other, DEFAULT_SAMPLE_TYPE will be put instead)
            - THE CHILD MUST BE THE FIRST ORGANISM

            ORGANISM_NAME PLOIDY SAMPLE_NAME SAMPLE_TYPE READ_FILE.fastq/ALIGNMENT_FILE.sam
        '''
        lorgname = list()  # list of organism names, in order of first appearance
        dploidy = dict()  # key: organism name ; value: ploidy
        ddsample = dict()  # key: organism name ; value: dict: key: sample name ; value: sample type
        ddreadfile = dict()  # key: organism name ; value: dict: key: sample name ; value: list of read files associated with this sample
        # key: organism name ; value: dict: key: sample name ; value: .sam file
        # Always created so that any truthy value of sam can safely fill it.
        ddsamfile = dict()

        for l in self.handle:  # for each line
            # A line read from a file keeps its newline, so emptiness has to
            # be tested after stripping whitespace.
            if not l.strip():
                continue

            content = l.rstrip('\n')
            ll = content.split('\t')
            if len(ll) != 5:
                sys.stderr.write("Cannot parse protocol line '{}'\n Line should contain five tab-delimited columns\n".format(content))
                sys.exit(1)

            orgname, ploidy, samplename, sampletype, current_file = ll

            if orgname not in dploidy:  # first time we see this organism
                lorgname.append(orgname)
                dploidy[orgname] = int(ploidy)
                ddsample[orgname] = dict()
                ddreadfile[orgname] = dict()
                if sam:
                    ddsamfile[orgname] = dict()

            if samplename not in ddsample[orgname]:  # first time we see this sample name for this organism
                if sampletype in Parameters().get_param('SAMPLE_TYPE'):
                    ddsample[orgname][samplename] = sampletype  # we add the type of the sample
                else:
                    ddsample[orgname][samplename] = Parameters().get_param('DEFAULT_SAMPLE_TYPE')
                ddreadfile[orgname][samplename] = list()

            # verification of the files
            if not os.path.isfile(current_file):
                raise IOError("%s couldn't be found\n" % current_file)

            if sam:
                if not current_file.endswith(".sam"):
                    sys.stderr.write("WARNING: Sam files option turned on in the command line, but file extension is: %s\n" % os.path.splitext(current_file)[-1])
                ddsamfile[orgname][samplename] = current_file
                # no read file in sam mode: a None placeholder is recorded instead
                ddreadfile[orgname][samplename].append(None)
            else:
                if not current_file.endswith(".fastq"):
                    sys.stderr.write("WARNING: Fastq file expected, but file extension is: %s\n" % os.path.splitext(current_file)[-1])
                ddreadfile[orgname][samplename].append(current_file)  # we add the name of the .fastq file

        # we can now create our Organisms instances
        lorg = list()
        for name in lorgname:  # for each organism
            org = Organism(name, dploidy[name], ddreadfile[name], list(ddsample[name].keys()), ddsample[name])
            if sam:  # we already have the .sam files
                org.dsamfile = ddsamfile[name]
            lorg.append(org)
        return lorg