Source code for gptwosample.data.dataIO
"""
Data IO tool
============
For convenient usage, this module provides IO operations for data.
Created on Jun 9, 2011
@author: Max Zwiessele, Oliver Stegle
"""
import csv, scipy as SP, sys
import os
import numpy
from re import compile
def get_data_from_csv(path_to_file, delimiter=',', count=-1, verbose=True, message="Reading File", fil=None):
    '''
    Return data from a csv file with the given delimiter in form of a dictionary.
    Missing values are all values x which cannot be converted by float(x).
    If fil is given, it is interpreted as a list of regular expression strings and
    only genes whose names match at least one pattern are read.

    The file has to fulfill the following format:

    ============ =============== ==== ===============
    *arbitrary*  x1              ...  xl
    ============ =============== ==== ===============
    Gene Name 1  y1 replicate 1  ...  yl replicate 1
    ...          ...             ...  ...
    Gene Name 1  y1 replicate k1 ...  yl replicate k1
    ...
    Gene Name n  y1 replicate 1  ...  yl replicate 1
    ...          ...             ...  ...
    Gene Name n  y1 replicate kn ...  yl replicate kn
    ============ =============== ==== ===============

    Returns: {"input": [x1, ..., xl], "Gene Name 1": [[y1 replicate 1, ..., yl replicate 1], ..., [y1 replicate k1, ..., yl replicate k1]], ...}
    '''
    def filter_(x):
        # values that cannot be parsed as a float are treated as missing (NaN)
        try:
            return float(str.strip(x))
        except (ValueError, TypeError):
            return numpy.nan
    # total number of lines, used only for progress reporting
    end = float(count_lines(path_to_file))
if fil is not None:
fil = map(compile,fil)
filtered = []
def matchesin(name, fil):
for f in fil:
if f.match(name):
return True
return False
with open(path_to_file, "r") as out_file:
reader = csv.reader(out_file, delimiter=str(delimiter))
out = sys.stdout
current_line = 0
if verbose:
message = "{1:s} {0:s}".format(os.path.basename(path_to_file), message)
mess = lambda x: "{1:s}: {0:.2%}".format(x, message)
out.write(mess(0) + " \r")
        # first row: arbitrary header cell followed by the inputs x1 ... xl
        data = {"input": map(filter_, reader.next()[1:])}
for line in reader:
if line:
gene_name = line[0]
if fil is not None:
if gene_name in filtered:
continue
if not matchesin(gene_name, fil):
filtered.append(gene_name)
continue
l_filtered = [filter_(x) for x in line[1:]]
                if gene_name in data:
data[gene_name].append(l_filtered)
else:
data[gene_name] = [l_filtered]
current_line += 1
if verbose:
out.flush()
out.write(mess(current_line / end) + "\r")
out.flush()
if verbose:
try:
out.write(message + " " + '\033[92m' + u"\u2713" + '\033[0m' + ' \r')
except:
out.write(message + " done \r")
    for name, expr in data.iteritems():
        try:
            data[name] = SP.array(expr, dtype='float')
        except:
            # 'input' is the header row and need not be convertible;
            # report conversion failures for all other datasets
            if not (name == 'input'):
                print "Caught failure on dataset with name %s:" % (name)
                print sys.exc_info()[0]
return data
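# Example usage of get_data_from_csv (kept as a comment so the module still
# imports cleanly; the file name "toy_expression.csv" and the gene name are
# hypothetical):
#
#     data = get_data_from_csv("toy_expression.csv", delimiter=',')
#     x = data["input"]           # [x1, ..., xl], e.g. measurement time points
#     expr = data["Gene Name 1"]  # array of shape (replicates, l); NaN marks missing values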
def write_data_to_csv(data, path_to_file, header='GPTwoSample', delimiter=','):
"""
    Write given data in training_data_structure (see :py:class:`gptwosample.data.data_base`
    for details) into the file at path_to_file.

    **Parameters:**

    data : dict
        data to write, in training_data_structure
    path_to_file : String
        the path of the file to write to
    header : String
        name of the table, written into the first cell
    delimiter : character
        delimiter for the csv file
"""
    data = data.copy()
    if not path_to_file.endswith(".csv"):
        path_to_file += ".csv"
    out_file = open(path_to_file, "w")
    writer = csv.writer(out_file, delimiter=str(delimiter))
line = [header]
line.extend(data.pop("input"))
writer.writerow(line)
    for name, line in data.iteritems():
        if line.shape[0] > 1:
            # several replicates: repeat the gene name in front of every row
            l = [[name]] * line.shape[0]
            l = SP.concatenate((l, line), axis=1)
            writer.writerows(l)
        else:
            # single replicate: one row, prefixed with the gene name
            l = SP.concatenate(([[name]], line), axis=1)
            writer.writerow(l[0])
    out_file.flush()
    out_file.close()
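# Example round trip with write_data_to_csv (a sketch; the file names are
# hypothetical):
#
#     data = get_data_from_csv("toy_expression.csv")
#     write_data_to_csv(data, "toy_expression_copy.csv", header="GPTwoSample")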
def count_lines(filename):
    # Fast line count by scanning the file in 1 MB chunks; used only to
    # estimate progress in get_data_from_csv, so an off-by-one is harmless.
    lines = 1
    buf_size = 1024 * 1024
    with open(filename) as f:
        read_f = f.read
        buf = read_f(buf_size)
        while buf:
            lines += buf.count('\n')
            buf = read_f(buf_size)
    return lines