Source code for intake_avro.source

from intake.source import base

from . import __version__


class AvroTableSource(base.DataSource):
    """
    Source to load tabular avro datasets.

    Parameters
    ----------
    urlpath: str
        Location of the data files; can include protocol and glob characters.
    """
    version = __version__
    container = 'dataframe'

    def __init__(self, urlpath, metadata=None, storage_options=None):
        self._urlpath = urlpath
        self._storage_options = storage_options or {}
        self._head = None
        super(AvroTableSource, self).__init__(metadata=metadata)

    def _get_schema(self):
        from dask.bytes.core import open_files
        import uavro.core as avrocore
        self._files = open_files(self._urlpath, mode='rb',
                                 **self._storage_options)
        if self._head is None:
            # read the avro header from the first file only; the remaining
            # files in the glob are assumed to share the same schema
            with self._files[0] as f:
                self._head = avrocore.read_header(f)

        dtypes = self._head['dtypes']
        # avro schemas have a "namespace" and a "name" that could be metadata
        return base.Schema(datashape=None,
                           dtype=dtypes,
                           shape=(None, len(dtypes)),
                           npartitions=len(self._files),
                           extra_metadata={})

    def _get_partition(self, i):
        return read_file_uavro(self._files[i], self._head)

    def read(self):
        self._get_schema()
        return self.to_dask().compute()

    def to_dask(self):
        """Create lazy dask dataframe object"""
        import dask.dataframe as dd
        from dask import delayed
        self.discover()
        dpart = delayed(read_file_uavro)
        return dd.from_delayed([dpart(f, self._head) for f in self._files],
                               meta=self.dtype)


def read_file_uavro(f, head):
    import uavro.core as avrocore
    with f as f:
        # determine the file size, then parse the whole file into a dataframe
        f.seek(0, 2)
        size = f.tell()
        f.seek(0)
        return avrocore.filelike_to_dataframe(f, size, head, scan=True)
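

# Example usage of AvroTableSource (a minimal sketch; the S3 path and the
# "anon" storage option are illustrative, not part of this module):
#
#     from intake_avro.source import AvroTableSource
#
#     src = AvroTableSource('s3://bucket/data/*.avro',
#                           storage_options={'anon': True})
#     ddf = src.to_dask()   # lazy dask dataframe, one partition per file
#     df = src.read()       # eager pandas dataframe (computes all partitions)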


class AvroSequenceSource(base.DataSource):
    """
    Source to load avro datasets as sequence of python dicts.

    Parameters
    ----------
    urlpath: str
        Location of the data files; can include protocol and glob characters.
    """
    version = __version__
    container = 'python'

    def __init__(self, urlpath, metadata=None, storage_options=None):
        self._urlpath = urlpath
        self._storage_options = storage_options or {}
        self._head = None
        super(AvroSequenceSource, self).__init__(metadata=metadata)

    def _get_schema(self):
        from dask.bytes.core import open_files
        self._files = open_files(self._urlpath, mode='rb',
                                 **self._storage_options)
        # avro schemas have a "namespace" and a "name" that could be metadata
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=len(self._files),
                           extra_metadata={})

    def _get_partition(self, i):
        self._get_schema()
        return read_file_fastavro(self._files[i])

    def read(self):
        self._get_schema()
        return self.to_dask().compute()

    def to_dask(self):
        """Create lazy dask bag object"""
        from dask import delayed
        import dask.bag as db
        self._get_schema()
        dpart = delayed(read_file_fastavro)
        return db.from_delayed([dpart(f) for f in self._files])


def read_file_fastavro(f):
    import fastavro
    with f as f:
        # materialise every record in the file as a python dict
        return list(fastavro.reader(f))
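

# Example usage of AvroSequenceSource (a minimal sketch; the local glob path
# is illustrative, not part of this module):
#
#     from intake_avro.source import AvroSequenceSource
#
#     src = AvroSequenceSource('data/part-*.avro')
#     bag = src.to_dask()    # lazy dask bag of dicts, one partition per file
#     records = src.read()   # eager list of python dicts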