from intake.source import base
from . import __version__
class AvroTableSource(base.DataSource):
    """
    Source to load tabular avro datasets.

    Parameters
    ----------
    urlpath: str
        Location of the data files; can include protocol and glob characters.
    metadata: dict or None
        Extra metadata to associate with this source.
    storage_options: dict or None
        Keyword arguments passed on to the file-system backend when opening
        ``urlpath`` (e.g. credentials for remote storage).
    """
    version = __version__
    container = 'dataframe'

    def __init__(self, urlpath, metadata=None, storage_options=None):
        self._urlpath = urlpath
        self._storage_options = storage_options or {}
        # Avro header of the first file; populated lazily by _get_schema().
        self._head = None
        super(AvroTableSource, self).__init__(metadata=metadata)

    def _get_schema(self):
        from dask.bytes.core import open_files
        import uavro.core as avrocore
        self._files = open_files(self._urlpath, mode='rb',
                                 **self._storage_options)
        if self._head is None:
            # Read the header of the first file only; all files of the
            # dataset are assumed to share the same schema.
            with self._files[0] as f:
                self._head = avrocore.read_header(f)

        dtypes = self._head['dtypes']
        # avro schemas have a "namespace" and a "name" that could be metadata
        return base.Schema(datashape=None,
                           dtype=dtypes,
                           shape=(None, len(dtypes)),
                           npartitions=len(self._files),
                           extra_metadata={})

    def _get_partition(self, i):
        # Ensure files and header are loaded even if discover() was never
        # called (mirrors AvroSequenceSource._get_partition).
        if self._head is None:
            self._get_schema()
        return read_file_uavro(self._files[i], self._head)

    def read(self):
        """Load the whole dataset eagerly into a single pandas dataframe."""
        self._get_schema()
        return self.to_dask().compute()

    def to_dask(self):
        """Create lazy dask dataframe object"""
        import dask.dataframe as dd
        from dask import delayed
        self.discover()
        dpart = delayed(read_file_uavro)
        # One delayed partition per input file, all sharing the same header.
        return dd.from_delayed([dpart(f, self._head) for f in self._files],
                               meta=self.dtype)
def read_file_uavro(f, head):
    """Read a single avro file-reference into a pandas dataframe via uavro.

    ``f`` is a context manager yielding a binary file-like object; ``head``
    is the avro header previously read from the dataset.
    """
    import uavro.core as avrocore
    with f as fo:
        # Determine the total size by seeking to the end, then rewind
        # before handing the file to the parser.
        fo.seek(0, 2)
        nbytes = fo.tell()
        fo.seek(0)
        return avrocore.filelike_to_dataframe(fo, nbytes, head, scan=True)
class AvroSequenceSource(base.DataSource):
    """
    Source to load avro datasets as sequence of python dicts.

    Parameters
    ----------
    urlpath: str
        Location of the data files; can include protocol and glob characters.
    metadata: dict or None
        Extra metadata to associate with this source.
    storage_options: dict or None
        Keyword arguments passed on to the file-system backend when opening
        ``urlpath`` (e.g. credentials for remote storage).
    """
    version = __version__
    container = 'python'

    def __init__(self, urlpath, metadata=None, storage_options=None):
        self._urlpath = urlpath
        self._storage_options = storage_options or {}
        # Kept for interface symmetry with AvroTableSource; never filled in,
        # since fastavro reads the schema per-file.
        self._head = None
        super(AvroSequenceSource, self).__init__(metadata=metadata)

    def _get_schema(self):
        from dask.bytes.core import open_files
        self._files = open_files(self._urlpath, mode='rb',
                                 **self._storage_options)
        # avro schemas have a "namespace" and a "name" that could be metadata
        # No dtype/shape here: records are plain dicts, not a table.
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=len(self._files),
                           extra_metadata={})

    def _get_partition(self, i):
        # Refresh the file list so partition access works even if discover()
        # was never called.
        self._get_schema()
        return read_file_fastavro(self._files[i])

    def read(self):
        """Load the whole dataset eagerly into a list of records."""
        self._get_schema()
        return self.to_dask().compute()

    def to_dask(self):
        """Create lazy dask bag object"""
        from dask import delayed
        import dask.bag as db
        self._get_schema()
        dpart = delayed(read_file_fastavro)
        # One delayed bag partition per input file.
        return db.from_delayed([dpart(f) for f in self._files])
def read_file_fastavro(f):
    """Read a single avro file-reference into a list of record dicts.

    ``f`` is a context manager yielding a binary file-like object; fastavro
    reads the embedded schema itself.
    """
    import fastavro
    with f as fo:
        return [record for record in fastavro.reader(fo)]