# Copyright (c) 2015 Sten Linnarsson
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
from typing import *
import h5py
import os.path
from scipy.io import mmread
import scipy.sparse
from shutil import copyfile
import logging
import time
def strip(s: str) -> str:
if s[0:2] == "b'" and s[-1] == "'":
return s[2:-1]
return s
def renumber(a: np.ndarray, keys: np.ndarray, values: np.ndarray) -> np.ndarray:
"""
Renumber 'a' by replacing any occurrence of 'keys' by the corresponding 'values'
"""
ordering = np.argsort(keys)
keys = keys[ordering]
	values = values[ordering]
index = np.digitize(a.ravel(), keys, right=True)
	return values[index].reshape(a.shape)
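# Illustrative sketch (hypothetical values): renumber() maps node ids through a
# key -> value lookup table, e.g. after permuting an axis:
#
#   a = np.array([[2, 0], [1, 2]])
#   renumber(a, keys=np.array([0, 1, 2]), values=np.array([10, 11, 12]))
#   # -> array([[12, 10], [11, 12]])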
class LoomAttributeManager(object):
__slots__ = ('f',)
def __init__(self, f: h5py.File) -> None:
self.f = f
def __contains__(self, name: str) -> bool:
return self.f.attrs.__contains__(name)
def __setitem__(self, name: str, value: str) -> None:
if type(value) == bytes:
self.f.attrs[name] = value
else:
self.f.attrs[name] = value.encode('utf-8')
self.f.flush()
def __getitem__(self, name: str) -> str:
val = self.f.attrs[name]
if type(val) == bytes:
val = val.decode('utf-8')
else:
val = str(val)
# Fix cosmetic bugs accidentally introduced by Python 2/3 bug
if val[0:2] == "b'" and val[-1] == "'":
val = val[2:-1]
return val
def __iter__(self) -> Iterator[str]:
for val in self.f.attrs:
yield val
def __len__(self) -> int:
return len(self.f.attrs)
	def get(self, name: str, default: str = None) -> str:
if self.__contains__(name):
return self[name]
else:
return default
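# Illustrative sketch (hypothetical file and keys): file attributes are exposed
# through a LoomAttributeManager on an open connection:
#
#   ds = connect("cortex.loom")
#   ds.attrs["title"] = "Cortex"  # stored UTF-8 encoded in the HDF5 file
#   print(ds.attrs.get("description", "(no description)"))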
class LoomConnection:
def __init__(self, filename: str, mode: str = 'r+') -> None:
"""
Establish a connection to a .loom file.
Args:
filename: Name of the .loom file to open
			mode: read/write mode, accepts 'r+' (read/write) or
				'r' (read-only); defaults to 'r+', and falls back
				to 'r' if an invalid mode is passed
Returns:
Nothing.
Row and column attributes are loaded into memory for fast access.
"""
		# Make sure a valid mode was passed; if not, default to read-only,
		# since an unexpected mode suggests the caller did not intend to write
		if mode != 'r+' and mode != 'r':
			logging.warn("Invalid mode passed to LoomConnection, falling back to read-only to avoid data loss")
mode = 'r'
self.mode = mode
self.filename = filename
self._file = h5py.File(filename, mode)
self.shape = [0, 0] # The correct shape gets assigned when the layers are loaded
if self._file.__contains__("/matrix"):
self.layer = {
"": LoomLayer(self, "", self._file["/matrix"].dtype)
}
self.shape = self._file["/matrix"].shape
if self._file.__contains__("/layers"):
for key in self._file["/layers"].keys():
self.layer[key] = LoomLayer(self, key, self._file["/layers/" + key].dtype)
else:
self.layer = {}
self.row_attrs = {} # type: Dict[str, np.ndarray]
for key in self._file['row_attrs'].keys():
self._load_attr(key, axis=0)
v = self.row_attrs[key]
			if len(v) > 0 and type(v[0]) is np.str_ and len(v[0]) >= 3 and v[0][:2] == "b'" and v[0][-1] == "'":
logging.warn("Unicode bug detected in row %s" % key)
if mode == 'r+':
logging.warn("Fixing unicode bug by re-setting row attribute '" + key + "'")
self._save_attr(key, np.array([x[2:-1] for x in v]), axis=0)
self._load_attr(key, axis=0)
self.col_attrs = {} # type: Dict[str, np.ndarray]
for key in self._file['col_attrs'].keys():
self._load_attr(key, axis=1)
v = self.col_attrs[key]
if len(v) > 0 and type(v[0]) is np.str_ and len(v[0]) >= 3 and v[0][:2] == "b'" and v[0][-1] == "'":
logging.warn("Unicode bug detected in column %s" % key)
if mode == 'r+':
logging.warn("Fixing unicode bug by re-setting column attribute '" + key + "'")
self._save_attr(key, np.array([x[2:-1] for x in v]), axis=1)
self._load_attr(key, axis=1)
self.attrs = LoomAttributeManager(self._file)
def _save_attr(self, name: str, values: np.ndarray, axis: int) -> None:
"""
Save an attribute to the file, nothing else
Remarks:
Handles unicode to ascii conversion (lossy, but HDF5 supports only ascii)
Does not update the attribute cache (use _load_attr for this)
"""
if self.mode != "r+":
raise IOError("Cannot save attributes when connected in read-only mode")
if values.dtype.type is np.str_:
values = np.array([x.encode('ascii', 'ignore') for x in values])
a = ["/row_attrs/", "/col_attrs/"][axis]
if self.shape[axis] != 0 and len(values) != self.shape[axis]:
raise ValueError("Attribute must have exactly %d values" % self.shape[axis])
if self._file[a].__contains__(name):
del self._file[a + name]
self._file[a + name] = values
self._file.flush()
def _load_attr(self, name: str, axis: int) -> None:
"""
Load an attribute from the file, nothing else
Remarks:
Handles ascii to unicode conversion
Updates the attribute cache as well as the class attributes
"""
a = ["/row_attrs/", "/col_attrs/"][axis]
def safe_decode(s: np.bytes_) -> str:
try:
return s.decode('utf8')
except UnicodeDecodeError:
return "[UTF-8 DECODING ERROR]"
if self._file[a][name].dtype.kind == 'S':
vals = np.array([safe_decode(x) for x in self._file[a][name][:]])
else:
vals = self._file[a][name][:]
if axis == 0:
self.row_attrs[name] = vals
if not hasattr(LoomConnection, name):
setattr(self, name, self.row_attrs[name])
else:
self.col_attrs[name] = vals
if not hasattr(LoomConnection, name):
setattr(self, name, self.col_attrs[name])
def _repr_html_(self) -> str:
"""
Return an HTML representation of the loom file, showing the upper-left 10x10 corner.
"""
rm = min(10, self.shape[0])
cm = min(10, self.shape[1])
html = "<p>"
if self.attrs.__contains__("title"):
html += "<strong>" + self.attrs["title"] + "</strong> "
html += "(" + str(self.shape[0]) + " genes, " + str(self.shape[1]) + " cells, " + str(len(self.layer)) + " layers)<br/>"
html += self._file.filename + "<br/>"
if self.attrs.__contains__("description"):
html += "<em>" + self.attrs["description"] + "</em><br/>"
html += "<table>"
# Emit column attributes
for ca in self.col_attrs.keys():
html += "<tr>"
for ra in self.row_attrs.keys():
html += "<td> </td>" # Space for row attrs
html += "<td><strong>" + ca + "</strong></td>" # Col attr name
for v in self.col_attrs[ca][:cm]:
html += "<td>" + str(v) + "</td>"
if self.shape[1] > cm:
html += "<td>...</td>"
html += "</tr>"
# Emit row attribute names
html += "<tr>"
for ra in self.row_attrs.keys():
html += "<td><strong>" + ra + "</strong></td>" # Row attr name
html += "<td> </td>" # Space for col attrs
for v in range(cm):
html += "<td> </td>"
if self.shape[1] > cm:
html += "<td>...</td>"
html += "</tr>"
# Emit row attr values and matrix values
for row in range(rm):
html += "<tr>"
for ra in self.row_attrs.keys():
html += "<td>" + str(self.row_attrs[ra][row]) + "</td>"
html += "<td> </td>" # Space for col attrs
for v in self[row, :cm]:
html += "<td>" + str(v) + "</td>"
if self.shape[1] > cm:
html += "<td>...</td>"
html += "</tr>"
# Emit ellipses
if self.shape[0] > rm:
html += "<tr>"
for v in range(rm + 1 + len(self.row_attrs.keys())):
html += "<td>...</td>"
if self.shape[1] > cm:
html += "<td>...</td>"
html += "</tr>"
html += "</table>"
return html
def __getitem__(self, slice: Tuple[Union[int, slice], Union[int, slice]]) -> np.ndarray:
"""
Get a slice of the main matrix.
Args:
slice: A slice object (see http://docs.h5py.org/en/latest/high/dataset.html)
Returns:
A numpy matrix
"""
return self.layer[""][slice]
def __setitem__(self, slice: Tuple[Union[int, slice], Union[int, slice]], data: np.ndarray) -> None:
"""
Assign a slice of the main matrix.
Args:
slice: A slice object (see http://docs.h5py.org/en/latest/high/dataset.html)
Returns:
Nothing.
"""
self.layer[""][slice] = data
	def sparse(self, genes: np.ndarray = None, cells: np.ndarray = None, layer: str = None) -> scipy.sparse.coo_matrix:
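		"""
		Return the main matrix (or the named layer) as a scipy.sparse.coo_matrix,
		optionally restricted to the given genes (rows) and cells (columns).
		"""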
if layer is None:
return self.layer[""].sparse(genes=genes, cells=cells)
else:
return self.layer[layer].sparse(genes=genes, cells=cells)
	def close(self) -> None:
"""
Close the connection. After this, the connection object becomes invalid.
"""
self._file.close()
self._file = None
self.row_attrs = {}
self.col_attrs = {}
self.shape = [0, 0]
	def set_layer(self, name: str, matrix: np.ndarray, chunks: Tuple[int, int] = (64, 64), chunk_cache: int = 512, dtype: str = "float32", compression_opts: int = 2) -> None:
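		"""
		Create or replace the named layer (pass name "" for the main matrix) with the given matrix.
		"""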
if self.mode != "r+":
raise IOError("Cannot save attributes when connected in read-only mode")
if not np.isfinite(matrix).all():
raise ValueError("INF and NaN not allowed in loom matrix")
if not self._file.__contains__("/layers"):
self._file.create_group("/layers")
# make sure chunk size is not bigger than actual matrix size
chunks = (min(chunks[0], matrix.shape[0]), min(chunks[1], matrix.shape[1]))
path = "/layers/" + name
if name == "":
path = "/matrix"
if self._file.__contains__(path):
del self._file[path]
# Save the main matrix
if compression_opts is None:
self._file.create_dataset(
path,
data=matrix.astype(dtype),
maxshape=(matrix.shape[0], None),
chunks=chunks,
fletcher32=False
)
else:
self._file.create_dataset(
path,
data=matrix.astype(dtype),
maxshape=(matrix.shape[0], None),
chunks=chunks,
fletcher32=False,
compression="gzip",
shuffle=False,
compression_opts=compression_opts
)
self.layer[name] = LoomLayer(self, name, dtype)
if name == "":
self.shape = matrix.shape
self._file.flush()
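	# Illustrative sketch (hypothetical names): a layer is an extra matrix with
	# the same shape as the main matrix, stored under /layers in the file:
	#
	#   ds = connect("cortex.loom")
	#   ds.set_layer("spliced", spliced_counts, dtype="uint16")
	#   ds.layer["spliced"][0:10, 0:10]     # read back a corner of the new layer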
	def add_columns(self, submatrix: np.ndarray, col_attrs: Dict[str, np.ndarray], fill_values: Dict[str, np.ndarray] = None) -> None:
"""
Add columns of data and attribute values to the dataset.
Args:
submatrix (dict or numpy.ndarray):
Either:
				1) An N-by-M matrix of float32s (N rows, M columns); in this case the columns are added to the default layer
2) A dict {layer_name : matrix} specified so that the matrix (N, M) will be added to layer `layer_name`
col_attrs (dict):
Column attributes, where keys are attribute names and values are numpy arrays (float or string) of length M
Returns:
Nothing.
		Notes
		-----
		- This will modify the underlying HDF5 file, which will interfere with any concurrent readers.
		- Column attributes in the file that are NOT provided will be deleted.
		- Arrays containing NaN should not be provided.
"""
if self.mode != "r+":
raise IOError("Cannot add columns when connected in read-only mode")
if not type(submatrix) == dict:
submatrix_dict = dict()
submatrix_dict[""] = submatrix
else:
			submatrix_dict = cast(dict, submatrix)  # same object; the cast only satisfies the type checker
submatrix = submatrix_dict[""]
# for k, v in submatrix_dict.items():
# if not np.isfinite(v).all():
# raise ValueError("INF and NaN not allowed in loom matrix")
if submatrix.shape[0] != self.shape[0]:
raise ValueError("New submatrix must have same number of rows as existing matrix")
did_remove = False
todel = [] # type: List[str]
for key, vals in col_attrs.items():
if key not in self.col_attrs:
if fill_values is not None:
if fill_values == "auto":
fill_with = np.zeros(1, dtype=col_attrs[key].dtype)[0]
else:
fill_with = fill_values[key]
self.set_attr(key, np.array([fill_with] * self.shape[1]), axis=1)
else:
did_remove = True
todel.append(key)
if len(vals) != submatrix.shape[1]:
raise ValueError("Each column attribute must have exactly %s values" % submatrix.shape[1])
for key in todel:
del col_attrs[key]
if did_remove:
logging.warn("Some column attributes were removed: " + ",".join(todel))
todel = []
did_remove = False
for key in self.col_attrs.keys():
if key not in col_attrs:
if fill_values is not None:
if fill_values == "auto":
fill_with = np.zeros(1, dtype=self.col_attrs[key].dtype)[0]
else:
fill_with = fill_values[key]
col_attrs[key] = np.array([fill_with] * submatrix.shape[1])
else:
did_remove = True
todel.append(key)
for key in todel:
self.delete_attr(key, axis=1)
if did_remove:
logging.warn("Some column attributes were removed: " + ",".join(todel))
n_cols = submatrix.shape[1] + self.shape[1]
for key, vals in col_attrs.items():
vals = np.array(vals)
if vals.dtype.type is np.str_:
vals = np.array([x.encode('ascii', 'ignore') for x in vals])
temp = self._file['/col_attrs/' + key][:]
casting_rule_dtype = np.result_type(temp, vals)
if vals.dtype != casting_rule_dtype:
vals = vals.astype(casting_rule_dtype)
if temp.dtype != casting_rule_dtype:
temp = temp.astype(casting_rule_dtype)
temp.resize((n_cols,))
temp[self.shape[1]:] = vals
del self._file['/col_attrs/' + key]
self._file['/col_attrs/' + key] = temp
self.col_attrs[key] = self._file['/col_attrs/' + key]
# Add the columns layerwise
for key in self.layer.keys():
self.layer[key].resize(n_cols, axis=1)
self.layer[key][:, self.shape[1]:n_cols] = submatrix_dict[key].astype(self.layer[key].dtype)
self._file.flush()
self.shape = [self.shape[0], n_cols]
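	# Illustrative sketch (hypothetical arrays): appending 100 cells to an open
	# file; the submatrix must have ds.shape[0] rows, and every existing column
	# attribute must be supplied (or filled in via fill_values):
	#
	#   new_data = np.zeros((ds.shape[0], 100), dtype="float32")
	#   new_ids = np.array(["cell_%d" % i for i in range(100)])
	#   ds.add_columns(new_data, {"CellID": new_ids})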
	def add_loom(self, other_file: str, key: str = None, fill_values: Dict[str, np.ndarray] = None, batch_size: int = 1000) -> None:
"""
Add the content of another loom file
Args:
			other_file (str): filename of the loom file to append
			key (str): row attribute to use as the primary key for aligning rows (or None to assume identical row ordering)
			fill_values (dict): default values to use for missing attributes (or None to drop missing attrs, or 'auto' to fill with sensible defaults)
			batch_size (int): the batch size used by batch_scan_layers (limits the number of rows/columns read in memory)
		Returns:
			Nothing, but appends the contents of the other loom file. The other file must have exactly
			the same number of rows, and must have exactly the same column attributes.
			All contents including layers are added, but layers in `other_file` that are not
			already present in self are ignored.
"""
if self.mode != "r+":
raise IOError("Cannot add data when connected in read-only mode")
# Connect to the loom files
other = connect(other_file)
# Verify that the row keys can be aligned
ordering = None
if key is not None:
			# This was Sten's original version, but it creates a 400M-entry array in memory:
			# ordering = np.where(other.row_attrs[key][None, :] == self.row_attrs[key][:, None])[1]
			def ixs_thatsort_a2b(a: np.ndarray, b: np.ndarray, check_content: bool = True) -> np.ndarray:
				"Return the indexes that sort `a` into the same order as `b`"
if check_content:
					assert len(np.intersect1d(a, b)) == len(a), "The two arrays do not contain the same elements"
return np.argsort(a)[np.argsort(np.argsort(b))]
ordering = ixs_thatsort_a2b(a=other.row_attrs[key], b=self.row_attrs[key])
pk1 = sorted(other.row_attrs[key])
pk2 = sorted(self.row_attrs[key])
for ix, val in enumerate(pk1):
if pk2[ix] != val:
raise ValueError("Primary keys are not 1-to-1 alignable!")
diff_layers = set(self.layer.keys()) - set(other.layer.keys())
if len(diff_layers) > 0:
raise ValueError("%s is missing a layer, cannot merge with current file. layers missing:%s" % (other_file, diff_layers))
		for (ix, selection, vals) in other.batch_scan_layers(axis=1, layers=self.layer.keys(), batch_size=batch_size):
ca = {key: v[selection] for key, v in other.col_attrs.items()}
if ordering is not None:
vals = {key: val[ordering, :] for key, val in vals.items()}
self.add_columns(vals, ca, fill_values)
other.close()
	def delete_attr(self, name: str, axis: int = 0, raise_on_missing: bool = True) -> None:
"""
Permanently delete an existing attribute and all its values
Args:
name (str): Name of the attribute to remove
axis (int): Axis of the attribute (0 = rows, 1 = columns)
Returns:
Nothing.
"""
if self.mode != "r+":
raise IOError("Cannot delete attributes when connected in read-only mode")
if axis == 0:
if name not in self.row_attrs:
if raise_on_missing:
raise KeyError("Row attribute " + name + " does not exist")
else:
return
del self.row_attrs[name]
del self._file['/row_attrs/' + name]
if hasattr(self, name):
delattr(self, name)
elif axis == 1:
if name not in self.col_attrs:
if raise_on_missing:
raise KeyError("Column attribute " + name + " does not exist")
else:
return
del self.col_attrs[name]
del self._file['/col_attrs/' + name]
if hasattr(self, name):
delattr(self, name)
else:
raise ValueError("Axis must be 0 or 1")
self._file.flush()
	def set_attr(self, name: str, values: np.ndarray, axis: int = 0, dtype: str = None) -> None:
"""
Create or modify an attribute.
Args:
name (str): Name of the attribute
values (numpy.ndarray): Array of values of length equal to the axis length
axis (int): Axis of the attribute (0 = rows, 1 = columns)
Returns:
Nothing.
This will overwrite any existing attribute of the same name.
"""
if self.mode != "r+":
raise IOError("Cannot save attributes when connected in read-only mode")
if dtype is not None:
raise DeprecationWarning("Data type should no longer be provided")
self.delete_attr(name, axis, raise_on_missing=False)
self._save_attr(name, values, axis)
self._load_attr(name, axis)
	def list_edges(self, axis: int) -> List[str]:
if axis == 0 and "row_edges" in self._file:
return [k for k in self._file["row_edges"]]
elif axis == 1 and "col_edges" in self._file:
return [k for k in self._file["col_edges"]]
else:
return []
	def get_edges(self, name: str, axis: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
if axis == 0:
return (self._file["/row_edges/" + name + "/a"][:], self._file["/row_edges/" + name + "/b"][:], self._file["/row_edges/" + name + "/w"][:])
if axis == 1:
return (self._file["/col_edges/" + name + "/a"][:], self._file["/col_edges/" + name + "/b"][:], self._file["/col_edges/" + name + "/w"][:])
raise ValueError("Axis must be 0 or 1")
	def set_edges(self, name: str, a: np.ndarray, b: np.ndarray, w: np.ndarray, axis: int) -> None:
if self.mode != "r+":
raise IOError("Cannot save edges when connected in read-only mode")
if not a.dtype.kind == 'i':
raise ValueError("Nodes must be integers")
if not b.dtype.kind == 'i':
raise ValueError("Nodes must be integers")
if axis == 1:
			if a.max() >= self.shape[1] or a.min() < 0:
				raise ValueError("Nodes out of range")
			if b.max() >= self.shape[1] or b.min() < 0:
				raise ValueError("Nodes out of range")
if self._file.__contains__("/col_edges/" + name):
del self._file["/col_edges/" + name + "/a"]
del self._file["/col_edges/" + name + "/b"]
del self._file["/col_edges/" + name + "/w"]
self._file["/col_edges/" + name + "/a"] = a
self._file["/col_edges/" + name + "/b"] = b
self._file["/col_edges/" + name + "/w"] = w
elif axis == 0:
			if a.max() >= self.shape[0] or a.min() < 0:
				raise ValueError("Nodes out of range")
			if b.max() >= self.shape[0] or b.min() < 0:
				raise ValueError("Nodes out of range")
if self._file.__contains__("/row_edges/" + name):
del self._file["/row_edges/" + name + "/a"]
del self._file["/row_edges/" + name + "/b"]
del self._file["/row_edges/" + name + "/w"]
self._file["/row_edges/" + name + "/a"] = a
self._file["/row_edges/" + name + "/b"] = b
self._file["/row_edges/" + name + "/w"] = w
else:
raise ValueError("Axis must be 0 or 1")
	def batch_scan(self, cells: np.ndarray = None, genes: np.ndarray = None, axis: int = 0, batch_size: int = 1000, layer: str = None) -> Iterable[Tuple[int, np.ndarray, np.ndarray]]:
"""Performs a batch scan of the loom file
Args
----
cells: np.ndarray
the indexes [1,2,3,..,1000] of the cells to select
genes: np.ndarray
the indexes [1,2,3,..,1000] of the genes to select
axis: int
0:rows or 1:cols
		batch_size: int
			the size of the chunks returned at every iteration
		layer: str
			the layer to scan (default: the main matrix, "")
Returns
------
Iterable that yields triplets
(ix, indexes, vals)
		ix: int
			first position / how many rows/cols have been yielded already
indexes: np.ndarray[int]
			the indexes, numbered as in the input args cells / genes (i.e. np.arange(ds.shape[axis]))
this is ix + selection
vals: np.ndarray
the matrix corresponding to the chunk
"""
if cells is None:
cells = np.fromiter(range(self.shape[1]), dtype='int')
if genes is None:
genes = np.fromiter(range(self.shape[0]), dtype='int')
if layer is None:
layer = ""
if axis == 1:
cols_per_chunk = batch_size
ix = 0
while ix < self.shape[1]:
cols_per_chunk = min(self.shape[1] - ix, cols_per_chunk)
selection = cells - ix
# Pick out the cells that are in this batch
selection = selection[np.where(np.logical_and(selection >= 0, selection < cols_per_chunk))[0]]
if selection.shape[0] == 0:
ix += cols_per_chunk
continue
# Load the whole chunk from the file, then extract genes and cells using fancy indexing
vals = self.layer[layer][:, ix:ix + cols_per_chunk]
vals = vals[genes, :]
vals = vals[:, selection]
yield (ix, ix + selection, vals)
ix += cols_per_chunk
if axis == 0:
rows_per_chunk = batch_size
ix = 0
while ix < self.shape[0]:
rows_per_chunk = min(self.shape[0] - ix, rows_per_chunk)
selection = genes - ix
# Pick out the genes that are in this batch
selection = selection[np.where(np.logical_and(selection >= 0, selection < rows_per_chunk))[0]]
if selection.shape[0] == 0:
ix += rows_per_chunk
continue
# Load the whole chunk from the file, then extract genes and cells using fancy indexing
vals = self.layer[layer][ix:ix + rows_per_chunk, :]
vals = vals[selection, :]
vals = vals[:, cells]
yield (ix, ix + selection, vals)
ix += rows_per_chunk
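	# Illustrative sketch: summing each column in batches of 1000 columns,
	# without ever loading the whole matrix in memory:
	#
	#   totals = np.zeros(ds.shape[1])
	#   for (ix, selection, vals) in ds.batch_scan(axis=1):
	#       totals[selection] = vals.sum(axis=0)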
	def batch_scan_layers(self, cells: np.ndarray = None, genes: np.ndarray = None, axis: int = 0, batch_size: int = 1000, layers: Iterable = None) -> Iterable[Tuple[int, np.ndarray, Dict]]:
"""Performs a batch scan of the loom file dealing with multiple layer files
Args
----
cells: np.ndarray
the indexes [1,2,3,..,1000] of the cells to select
genes: np.ndarray
the indexes [1,2,3,..,1000] of the genes to select
axis: int
0:rows or 1:cols
		batch_size: int
			the size of the chunks returned at every iteration
		layers: iterable
			if specified, the scan covers only the named layers of the loom file
			e.g. if layers = [""], batch_scan_layers is equivalent to batch_scan
Returns
------
Iterable that yields triplets
(ix, indexes, vals)
		ix: int
			first position / how many rows/cols have been yielded already
		indexes: np.ndarray[int]
			the indexes, numbered as in the input args cells / genes (i.e. np.arange(ds.shape[axis]))
this is ix + selection
		vals: Dict[layername, np.ndarray]
			a dictionary of the matrices corresponding to the chunks of the different layers
"""
if cells is None:
cells = np.fromiter(range(self.shape[1]), dtype='int')
if genes is None:
genes = np.fromiter(range(self.shape[0]), dtype='int')
if layers is None:
layers = self.layer.keys()
if axis == 1:
cols_per_chunk = batch_size
ix = 0
while ix < self.shape[1]:
cols_per_chunk = min(self.shape[1] - ix, cols_per_chunk)
selection = cells - ix
# Pick out the cells that are in this batch
selection = selection[np.where(np.logical_and(selection >= 0, selection < cols_per_chunk))[0]]
if selection.shape[0] == 0:
ix += cols_per_chunk
continue
# Load the whole chunk from the file, then extract genes and cells using fancy indexing
vals = dict()
for key in layers:
vals[key] = self.layer[key][:, ix:ix + cols_per_chunk]
vals[key] = vals[key][genes, :]
vals[key] = vals[key][:, selection]
yield (ix, ix + selection, vals)
ix += cols_per_chunk
if axis == 0:
rows_per_chunk = batch_size
ix = 0
while ix < self.shape[0]:
rows_per_chunk = min(self.shape[0] - ix, rows_per_chunk)
selection = genes - ix
# Pick out the genes that are in this batch
selection = selection[np.where(np.logical_and(selection >= 0, selection < rows_per_chunk))[0]]
if selection.shape[0] == 0:
ix += rows_per_chunk
continue
# Load the whole chunk from the file, then extract genes and cells using fancy indexing
vals = dict()
for key in layers:
vals[key] = self.layer[key][ix:ix + rows_per_chunk, :]
vals[key] = vals[key][selection, :]
vals[key] = vals[key][:, cells]
yield (ix, ix + selection, vals)
ix += rows_per_chunk
	def map(self, f_list: List[Callable[[np.ndarray], int]], axis: int = 0, chunksize: int = 1000, selection: np.ndarray = None) -> List[np.ndarray]:
"""
Apply a function along an axis without loading the entire dataset in memory.
Args:
			f_list (list of func): Functions, each taking a numpy ndarray as argument
axis (int): Axis along which to apply the function (0 = rows, 1 = columns)
chunksize (int): Number of rows (columns) to load per chunk
selection (array of bool): Columns (rows) to include
Returns:
numpy.ndarray result of function application
If you supply a list of functions, the result will be a list of numpy arrays. This is more
efficient than repeatedly calling map() one function at a time.
"""
if hasattr(f_list, '__call__'):
raise ValueError("f_list must be a list of functions, not a function itself")
result = []
if axis == 0:
rows_per_chunk = chunksize
for i in range(len(f_list)):
result.append(np.zeros(self.shape[0]))
ix = 0
while ix < self.shape[0]:
rows_per_chunk = min(self.shape[0] - ix, rows_per_chunk)
if selection is not None:
chunk = self[ix:ix + rows_per_chunk, :][:, selection]
else:
chunk = self[ix:ix + rows_per_chunk, :]
for i in range(len(f_list)):
result[i][ix:ix + rows_per_chunk] = np.apply_along_axis(f_list[i], 1, chunk)
ix = ix + rows_per_chunk
elif axis == 1:
cols_per_chunk = chunksize
for i in range(len(f_list)):
result.append(np.zeros(self.shape[1]))
ix = 0
while ix < self.shape[1]:
cols_per_chunk = min(self.shape[1] - ix, cols_per_chunk)
if selection is not None:
chunk = self[:, ix:ix + cols_per_chunk][selection, :]
else:
chunk = self[:, ix:ix + cols_per_chunk]
for i in range(len(f_list)):
result[i][ix:ix + cols_per_chunk] = np.apply_along_axis(f_list[i], 0, chunk)
ix = ix + cols_per_chunk
return result
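	# Illustrative sketch: computing the mean and standard deviation of every
	# gene (row) in a single pass over the file:
	#
	#   means, sds = ds.map([np.mean, np.std], axis=0)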
	def permute(self, ordering: np.ndarray, axis: int) -> None:
"""
Permute the dataset along the indicated axis.
Args:
ordering (list of int): The desired order along the axis
axis (int): The axis along which to permute
Returns:
Nothing.
"""
if self._file.__contains__("tiles"):
del self._file['tiles']
ordering = list(np.array(ordering).flatten()) # Flatten the ordering, in case we got a column vector
if axis == 0:
for layer in self.layer:
if layer == "":
obj = self._file['/matrix']
else:
obj = self._file['/layers/' + layer]
chunksize = 5000
start = 0
while start < self.shape[1]:
submatrix = obj[:, start:start + chunksize]
obj[:, start:start + chunksize] = submatrix[ordering, :]
start = start + chunksize
for key in list(self.row_attrs.keys()):
self.set_attr(key, self.row_attrs[key][ordering], axis=0)
# Permute the edges
for name in self.list_edges(axis=0):
(a, b, w) = self.get_edges(name, axis=0)
a = renumber(a, np.array(ordering), np.arange(self.shape[0]))
b = renumber(b, np.array(ordering), np.arange(self.shape[0]))
self.set_edges(name, a, b, w, 0)
self._file.flush()
if axis == 1:
# Permute the columns of each layer
for layer in self.layer:
if layer == "":
obj = self._file['/matrix']
else:
obj = self._file['/layers/' + layer]
chunksize = 100000000 // self.shape[1]
start = 0
while start < self.shape[0]:
submatrix = obj[start:start + chunksize, :]
obj[start:start + chunksize, :] = submatrix[:, ordering]
start = start + chunksize
# Permute the column attributes
for key in list(self.col_attrs.keys()):
self.set_attr(key, self.col_attrs[key][ordering], axis=1)
# Permute the edges
for name in self.list_edges(axis=1):
(a, b, w) = self.get_edges(name, axis=1)
a = renumber(a, np.array(ordering), np.arange(self.shape[1]))
b = renumber(b, np.array(ordering), np.arange(self.shape[1]))
self.set_edges(name, a, b, w, 1)
self._file.flush()
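	# Illustrative sketch (hypothetical attribute): reordering all columns by a
	# cluster label, so that cells of the same cluster are stored contiguously:
	#
	#   ordering = np.argsort(ds.col_attrs["Clusters"])
	#   ds.permute(ordering, axis=1)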
	def export(self, out_file: str, layer: str = None, format: str = "tab") -> None:
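		"""
		Export the main matrix (or the named layer), with row and column attributes, as a tab-delimited text file.
		"""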
if format != "tab":
raise NotImplementedError("Only 'tab' is supported")
with open(out_file, "w") as f:
# Emit column attributes
for ca in self.col_attrs.keys():
for ra in self.row_attrs.keys():
f.write("\t")
f.write(ca + "\t")
for v in self.col_attrs[ca]:
f.write(str(v) + "\t")
f.write("\n")
# Emit row attribute names
for ra in self.row_attrs.keys():
f.write(ra + "\t")
f.write("\t")
for v in range(self.shape[1]):
f.write("\t")
f.write("\n")
# Emit row attr values and matrix values
for row in range(self.shape[0]):
for ra in self.row_attrs.keys():
f.write(str(self.row_attrs[ra][row]) + "\t")
f.write("\t")
if layer is None:
for v in self[row, :]:
f.write(str(v) + "\t")
else:
for v in self.layer[layer][row, :]:
f.write(str(v) + "\t")
f.write("\n")
class LoomLayer:
def __init__(self, ds: LoomConnection, name: str, dtype: str) -> None:
self.ds = ds
self.name = name
self.dtype = dtype
self.shape = ds.shape
def __getitem__(self, slice: Tuple[Union[int, slice], Union[int, slice]]) -> np.ndarray:
if self.name == "":
return self.ds._file['/matrix'].__getitem__(slice)
return self.ds._file['/layers/' + self.name].__getitem__(slice)
def __setitem__(self, slice: Tuple[Union[int, slice], Union[int, slice]], data: np.ndarray) -> None:
if self.name == "":
self.ds._file['/matrix'].__setitem__(slice, data.astype(self.dtype))
else:
self.ds._file['/layers/' + self.name].__setitem__(slice, data.astype(self.dtype))
	def sparse(self, genes: np.ndarray = None, cells: np.ndarray = None) -> scipy.sparse.coo_matrix:
n_genes = self.ds.shape[0] if genes is None else genes.shape[0]
n_cells = self.ds.shape[1] if cells is None else cells.shape[0]
data: np.ndarray = None
row: np.ndarray = None
col: np.ndarray = None
for (ix, selection, vals) in self.ds.batch_scan(genes=genes, cells=cells, axis=1, layer=self.name):
nonzeros = np.where(vals > 0)
if data is None:
data = vals[nonzeros]
row = nonzeros[0]
col = selection[nonzeros[1]]
else:
data = np.concatenate([data, vals[nonzeros]])
row = np.concatenate([row, nonzeros[0]])
col = np.concatenate([col, selection[nonzeros[1]]])
return scipy.sparse.coo_matrix((data, (row, col)), shape=(n_genes, n_cells))
	def resize(self, size: Tuple[int, int], axis: int = None) -> None:
"""Resize the dataset, or the specified axis.
The dataset must be stored in chunked format; it can be resized up to the "maximum shape" (keyword maxshape) specified at creation time.
The rank of the dataset cannot be changed.
"Size" should be a shape tuple, or if an axis is specified, an integer.
		BEWARE: This functions differently from the NumPy resize() method!
The data is not "reshuffled" to fit in the new shape; each axis is grown or shrunk independently.
The coordinates of existing data are fixed.
"""
if self.name == "":
self.ds._file['/matrix'].resize(size, axis)
else:
self.ds._file['/layers/' + self.name].resize(size, axis)
def _create_sparse(filename: str, matrix: np.ndarray, row_attrs: Dict[str, np.ndarray], col_attrs: Dict[str, np.ndarray], file_attrs: Dict[str, str] = None, chunks: Tuple[int, int] = (64, 64), chunk_cache: int = 512, dtype: str = "float32", compression_opts: int = 2) -> LoomConnection:
logging.info("Converting to csc format")
matrix = matrix.tocsc()
window = 2000
ix = 0
ds = None # type: LoomConnection
while ix < matrix.shape[1]:
ca = {key: val[ix:ix + window] for (key, val) in col_attrs.items()}
if ds is None:
logging.info("Creating")
ds = create(filename, matrix[:, ix:ix + window].toarray(), row_attrs, ca, file_attrs, chunks, chunk_cache, dtype, compression_opts)
else:
logging.info("Adding columns")
ds.add_columns(matrix[:, ix:ix + window].toarray(), ca)
ix += window
return ds
def create(filename: str, matrix: np.ndarray, row_attrs: Dict[str, np.ndarray], col_attrs: Dict[str, np.ndarray], file_attrs: Dict[str, str] = None, chunks: Tuple[int, int] = (64, 64), chunk_cache: int = 512, dtype: str = "float32", compression_opts: int = 2) -> LoomConnection:
"""
Create a new .loom file from the given data.
Args:
filename (str): The filename (typically using a `.loom` file extension)
matrix (numpy.ndarray): Two-dimensional (N-by-M) numpy ndarray of float values
row_attrs (dict): Row attributes, where keys are attribute names and values
are numpy arrays (float or string) of length N
col_attrs (dict): Column attributes, where keys are attribute names and
values are numpy arrays (float or string) of length M
chunks (tuple): The chunking of the matrix. Small chunks are slow
when loading a large batch of rows/columns in sequence,
but fast for single column/row retrieval.
Defaults to (64,64).
chunk_cache (int): Sets the chunk cache used by the HDF5 format inside
the loom file, in MB. If the cache is too small to
contain all chunks of a row/column in memory, then
sequential row/column access will be a lot slower.
Defaults to 512.
		file_attrs (dict): Global file attributes (title, description, url, etc.)
		dtype (str): Dtype of the matrix. Defaults to float32 (uint16 or float16 could also be used)
		compression_opts (int): Strength of the gzip compression. Defaults to 2 (pass None to disable compression)
Returns:
LoomConnection to created loom file.
"""
if file_attrs is None:
file_attrs = {}
if scipy.sparse.issparse(matrix):
return _create_sparse(filename, matrix, row_attrs, col_attrs, file_attrs, chunks, chunk_cache, dtype, compression_opts)
# Create the file (empty).
f = h5py.File(name=filename, mode='w')
f.create_group('/layers')
f.create_group('/row_attrs')
f.create_group('/col_attrs')
f.flush()
f.close()
ds = connect(filename)
ds.set_layer("", matrix, chunks, chunk_cache, dtype, compression_opts)
for key, vals in row_attrs.items():
ds.set_attr(key, vals, axis=0)
for key, vals in col_attrs.items():
ds.set_attr(key, vals, axis=1)
	for key in file_attrs:
		ds.attrs[key] = file_attrs[key]
# store creation date
currentTime = time.localtime(time.time())
ds.attrs['creation_date'] = time.strftime('%Y/%m/%d %H:%M:%S', currentTime)
ds.attrs['chunks'] = str(chunks)
return ds
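# Illustrative sketch (hypothetical data): creating a small loom file from an
# in-memory matrix with one row attribute and one column attribute:
#
#   matrix = np.random.rand(100, 50).astype("float32")
#   row_attrs = {"Gene": np.array(["gene_%d" % i for i in range(100)])}
#   col_attrs = {"CellID": np.array(["cell_%d" % i for i in range(50)])}
#   ds = create("test.loom", matrix, row_attrs, col_attrs)
#   ds.close()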
def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -> LoomConnection:
"""
Create a .loom file from 10X Genomics cellranger output
Args:
indir (str): path to the cellranger output folder (the one that contains 'outs')
		outdir (str): output folder where the new loom file should be saved (defaults to indir)
genome (str): genome build to load (e.g. 'mm10'; default: determine species from outs folder)
	Returns:
		A LoomConnection to the newly created loom file.
"""
if outdir is None:
outdir = indir
sampleid = os.path.split(os.path.abspath(indir))[-1]
matrix_folder = os.path.join(indir, 'outs', 'filtered_gene_bc_matrices')
if genome is None:
genome = [f for f in os.listdir(matrix_folder) if not f.startswith(".")][0]
matrix_folder = os.path.join(matrix_folder, genome)
matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).astype("float32").todense()
with open(os.path.join(matrix_folder, "genes.tsv"), "r") as f:
lines = f.readlines()
accession = np.array([x.split("\t")[0] for x in lines]).astype("str")
gene = np.array([x.split("\t")[1].strip() for x in lines]).astype("str")
with open(os.path.join(matrix_folder, "barcodes.tsv"), "r") as f:
lines = f.readlines()
cellids = np.array([sampleid + ":" + x.strip() for x in lines]).astype("str")
col_attrs = {"CellID": cellids}
row_attrs = {"Accession": accession, "Gene": gene}
tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "projection.csv")
# In cellranger V2 the file moved one level deeper
if not os.path.exists(tsne_file):
tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "2_components", "projection.csv")
if os.path.exists(tsne_file):
tsne = np.loadtxt(tsne_file, usecols=(1, 2), delimiter=',', skiprows=1)
col_attrs["_X"] = tsne[:, 0].astype('float32')
col_attrs["_Y"] = tsne[:, 1].astype('float32')
clusters_file = os.path.join(indir, "outs", "analysis", "clustering", "graphclust", "clusters.csv")
if os.path.exists(clusters_file):
labels = np.loadtxt(clusters_file, usecols=(1, ), delimiter=',', skiprows=1)
col_attrs["Clusters"] = labels.astype('int') - 1
return create(os.path.join(outdir, sampleid + ".loom"), matrix, row_attrs, col_attrs, {"Genome": genome})
def combine(files: List[str], output_file: str, key: str = None, file_attrs: Dict[str, str] = None, batch_size: int = 1000) -> None:
"""
Combine two or more loom files and save as a new loom file
Args:
files (list of str): the list of input files (full paths)
output_file (str): full path of the output loom file
key (string): Row attribute to use to verify row ordering
file_attrs (dict): file attributes (title, description, url, etc.)
		batch_size (int): limits the number of rows/columns read into memory at a time (default: 1000)
Returns:
Nothing, but creates a new loom file combining the input files.
The input files must (1) have exactly the same number of rows, (2) have
exactly the same sets of row and column attributes.
"""
if file_attrs is None:
file_attrs = {}
if len(files) == 0:
raise ValueError("The input file list was empty")
copyfile(files[0], output_file)
ds = connect(output_file)
for a in file_attrs:
ds.attrs[a] = file_attrs[a]
if len(files) >= 2:
for f in files[1:]:
ds.add_loom(f, key, batch_size=batch_size)
ds.close()
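# Illustrative sketch (hypothetical filenames): merging two samples into a new
# file, aligning rows on a shared "Accession" row attribute:
#
#   combine(["sample1.loom", "sample2.loom"], "merged.loom", key="Accession")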
def connect(filename: str, mode: str = 'r+') -> LoomConnection:
"""
Establish a connection to a .loom file.
Args:
filename (str): Name of the .loom file to open
mode (str): read/write mode, accepts 'r+' (read/write) or
'r' (read-only), defaults to 'r+'
Returns:
A LoomConnection instance.
"""
return LoomConnection(filename, mode)
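# Illustrative sketch (hypothetical filename): typical read-only access:
#
#   ds = connect("cortex.loom", mode="r")
#   print(ds.shape, list(ds.row_attrs.keys()))
#   ds.close()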