# Copyright 2013 Mitchell Stanton-Cook
#
# Licensed under the Educational Community License, Version 2.0 (the
# "License"); you may not use this file except in compliance with the
# License. You may obtain a copy of the License at
#
# http://www.osedu.org/licenses/ECL-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
import sys
import os
from Bio import SeqIO
from BanzaiDB import parsers


def nway_reportify(nway_any_file):
"""
Convert a nway.any to something similar to report.txt
This converts the nway.any which contains richer information (i.e. N
calls) into something similar to report.txt
TODO: Add a simple example of input vs output of this method.
ref_id, position, strains, ref_base, v_class, changes, evidence,
consequences
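
    A rough illustrative sketch (column names here are invented; the real
    file uses Nesoni's own header and formatting). For two strains S1 and
    S2 a data row is laid out as::

        ref  pos  class  ref_base  change_S1  change_S2  evid_S1  evid_S2  cons_S1  cons_S2

    and is returned as one list element of per-strain tuples::

        [(ref, pos, 'S1', ref_base, class, change_S1, evid_S1, cons_S1, uncalled),
         (ref, pos, 'S2', ref_base, class, change_S2, evid_S2, cons_S2, uncalled)]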

    :param nway_any_file: full path as a string to the nway.any file
    :type nway_any_file: string

    :returns: a list of tuples. Each list element refers to a variant
              position and contains one tuple per strain giving that
              strain's state
    """
    parsed = []
    nway_any_file = os.path.expanduser(nway_any_file)
    if not os.path.isfile(nway_any_file):
        print "Please specify a valid Nesoni n-way (any) SNP comparison file"
        sys.exit(1)
    else:
        with open(nway_any_file, 'r') as f:
            strains = f.readline().strip().split()[5:-1]
            num_strains = len(strains)/3
            strains = strains[:num_strains]
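            # Assumed header layout (inferred from the slicing above): five
            # leading columns, then one column per strain for each of the
            # change, evidence and consequence blocks (hence the division by
            # 3), plus a trailing column dropped by the [5:-1] slice.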
            for line in f:
                uncalled = False
                cur = line.split("\t")
                ref_id, position, v_class, ref_base = cur[0], int(cur[1]), cur[2], cur[3]
                changes = cur[4:num_strains+4]
                if 'N' in changes:
                    uncalled = True
                evidence = cur[num_strains+4:(2*(num_strains))+4]
                consequences = cur[(2*(num_strains))+4:-1]
                # Something is broken if not true -
                assert len(strains) == len(changes) == len(evidence)
                results = zip([ref_id]*num_strains, [position]*num_strains, strains,
                              [ref_base]*num_strains, [v_class]*num_strains,
                              changes, evidence, consequences,
                              [uncalled]*num_strains)
                parsed.append(results)
    return parsed


def nesoni_report_to_JSON(reportified):
"""
Convert a nesoni nway.any file that has been reportified to JSON
See: tables.rst for info on what is stored in RethinkDB
:param reportified: the reportified nway.any file (been through
nway_reportify()). This is essentially a list of tuples
:returns: a list of JSON
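
    An illustrative (entirely hypothetical) example of a single returned
    document; the "Evidence" value is whatever parsers.parse_evidence()
    returns for that strain::

        {"id": "S1_ref1_1042", "StrainID": "S1", "Position": 1042,
         "LocusTag": "ECO_0001", "Class": "substitution",
         "SubClass": "synonymous", "RefBase": "A", "ChangeBase": "G",
         "CDSBaseNum": 123, "CDSAANum": 41, "CDSRegion": None,
         "RefAA": "K", "ChangeAA": "K", "Product": "hypothetical protein",
         "CorrelatedChange": None, "Evidence": {...}, "UncalledBlock": False}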
"""
    stats = {}
    parsed_list = []
    for position in reportified:
        for elem in position:
            skip = False
            ref_id, pos, strain, old, ftype, new, evidence, cons, uncalled = elem
            ref_id = '.'.join(ref_id.split('.')[:-1])
            # Initialise the stats...
            if strain not in stats:
                stats[strain] = 0
            if new == old:
                # Have no change
                #dat = ["conserved"]+[None]*9
                skip = True
            elif new == 'N':
                # Have an uncalled base
                #dat = ["uncalled"]+[None]*9
                skip = True
            # Check for mixtures...
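            # e.g. (illustrative) a 'substitution' call whose change string
            # contains '-' is really a deletion for that strain, and a call
            # longer than one base (say 'AT') is really an insertion; the
            # same reasoning is applied below to calls hidden inside
            # 'deletion' and 'insertion' sites.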
elif ftype == "substitution" and new.find('-') != -1:
# Deletion hidden in substitution
ftype = 'deletion'
dat = extract_consequences(cons, ftype)
stats[strain] = stats[strain]+1
elif ftype == "substitution" and len(new) > 1:
# Insertion hidden in substitution
ftype = 'insertion'
dat = extract_consequences(cons, ftype)
stats[strain] = stats[strain]+1
elif ftype == "deletion" and new.find('-') == -1 and len(new) == 1:
# Substitution hidden in deletions
ftype = 'substitution'
dat = extract_consequences(cons, ftype)
stats[strain] = stats[strain]+1
elif ftype == "deletion" and new.find('-') == -1 and len(new) > 1:
# Insertion hidden in deletions
ftype = 'insertion'
dat = extract_consequences(cons, ftype)
stats[strain] = stats[strain]+1
elif ftype == "insertion" and new.find('-') != -1:
# Deletion hidden in insertions
ftype = 'deletion'
dat = extract_consequences(cons, ftype)
stats[strain] = stats[strain]+1
elif ftype == "insertion" and new.find('-') == -1 and len(new) == 1:
# Substitution hidden in insertions
ftype = 'substitution'
dat = extract_consequences(cons, ftype)
stats[strain] = stats[strain]+1
# We have the same change state across all strains
else:
dat = extract_consequences(cons, ftype)
stats[strain] = stats[strain]+1
obs_count = parsers.parse_evidence(evidence)
# Some simple tests
the_classes = ['insertion', 'deletion', 'substitution']
if not skip:
assert dat[0] in the_classes
json = {"id": strain+'_'+ref_id+'_'+str(pos),
"StrainID": strain,
"Position": pos,
"LocusTag": dat[2],
"Class": dat[0],
"SubClass": dat[1],
"RefBase": old,
"ChangeBase": new,
"CDSBaseNum": dat[3],
"CDSAANum": dat[4],
"CDSRegion": dat[5],
"RefAA": dat[6],
"ChangeAA": dat[7],
"Product": dat[8],
"CorrelatedChange": dat[9],
"Evidence": obs_count,
"UncalledBlock": uncalled
}
parsed_list.append(json)
return parsed_list, stats


def reference_genome_features_to_JSON(genome_file):
"""
From genome reference (GBK format) convert CDS, gene & RNA features to JSON
The following 2 are really good resources:
* http://www.ncbi.nlm.nih.gov/books/NBK63592/
* http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation
.. note:: also see tables.rst for detailed description of the JSON
schema
.. warning:: do not think that this handles misc_features
:param genome_file: the fullpath as a string to the genbank file
:returns: a JSON representing the the reference and a list of JSON
containing information on the features
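
    An illustrative (hypothetical) CDS document, with the sequence and
    translation shortened for readability::

        {'id': 'NC_000913.3_b0001_CDS', 'locus_tag': 'b0001',
         'reference_id': 'NC_000913.3', 'start': 189, 'end': 255,
         'strand': 1, 'product': 'thr operon leader peptide',
         'translation': 'MKRISTT...', 'sequence': 'ATGAAACGC...'}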
"""
    misc_set = ['tRNA', 'rRNA', 'tmRNA', 'ncRNA']
    with open(genome_file) as fin:
        genome = SeqIO.read(fin, "genbank")
    gd, gn, gid = genome.description, genome.name, genome.id
    print "Adding %s into the RethinkDB instance" % (gd)
    JSON_r = {'revision': int(gid.split('.')[-1]),
              'reference_name': gd,
              'id': gn}
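    # e.g. for a versioned accession such as "NC_000913.3" (illustrative),
    # the revision is 3 and the unversioned record name ("NC_000913")
    # becomes the reference document's id.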
    parsed_list = []
    for feat in genome.features:
        start = int(feat.location.start.position)
        JSON_f = {'sequence': str(feat.extract(genome.seq)),
                  'start': start,
                  'end': int(feat.location.end.position),
                  'strand': int(feat.strand),
                  'reference_id': gid,
                  'product': None,
                  'translation': None,
                  'locus_tag': None}
        # Handle CDS, gene, tRNA & rRNA features
        # Do CDS
        if feat.type == 'CDS':
            locus_tag = feat.qualifiers['locus_tag'][0]
            JSON_f['id'] = gid+"_"+locus_tag+"_CDS"
            JSON_f['locus_tag'] = locus_tag
            if 'pseudo' not in feat.qualifiers:
                JSON_f['translation'] = feat.qualifiers['translation'][0]
                JSON_f['product'] = feat.qualifiers['product'][0]
            else:
                JSON_f['product'] = 'pseudo'
            parsed_list.append(JSON_f)
        # Do gene
        elif feat.type == 'gene':
            locus_tag = feat.qualifiers['locus_tag'][0]
            JSON_f['id'] = gid+"_"+locus_tag+"_gene"
            if 'pseudo' not in feat.qualifiers:
                try:
                    JSON_f['product'] = feat.qualifiers['gene'][0]
                except KeyError:
                    # No gene name qualifier; leave product as None
                    pass
            else:
                JSON_f['product'] = 'pseudo'
            parsed_list.append(JSON_f)
        # Do other (*RNA)
        elif feat.type in misc_set:
            try:
                JSON_f['product'] = feat.qualifiers['product'][0]
            except KeyError:
                JSON_f['product'] = None
            JSON_f['id'] = gid+"_"+str(JSON_f['start'])+"-"+str(JSON_f['end'])
            parsed_list.append(JSON_f)
        else:
            print "Skipped feature at %i to %i " % (JSON_f['start'],
                                                    JSON_f['end'])
    return JSON_r, parsed_list
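

# A minimal usage sketch of the functions above; the file paths below are
# hypothetical placeholders.
if __name__ == '__main__':
    # Reportify a Nesoni n-way (any) comparison, then convert it to JSON
    reportified = nway_reportify('~/my_run/nway.any')
    variant_docs, per_strain_counts = nesoni_report_to_JSON(reportified)
    print "Parsed %i variant documents" % (len(variant_docs))
    # Convert the reference GenBank file into reference & feature documents
    reference_doc, feature_docs = reference_genome_features_to_JSON(
        '/path/to/reference.gbk')
    print "Reference %s has %i features" % (reference_doc['id'],
                                            len(feature_docs))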