Module redvox.common.api_reader
Read Redvox data from a single directory. Data files can be in either the API 900 or API 1000 data format.
"""
Read Redvox data from a single directory.
Data files can be in either the API 900 or API 1000 data format.
"""
from typing import List, Optional
from datetime import timedelta
import multiprocessing
import multiprocessing.pool
import redvox.api1000.proto.redvox_api_m_pb2 as api_m
from redvox.common import offset_model
from redvox.common import api_conversions as ac
from redvox.common import io
from redvox.common import file_statistics as fs
from redvox.common.parallel_utils import maybe_parallel_map
from redvox.common.station import Station
from redvox.common.errors import RedVoxExceptions
class ApiReader:
"""
    Reads data in API 900 or API 1000 format, converting all data read into RedvoxPacketM for
    ease of comparison and use.
Properties:
filter: io.ReadFilter with the station ids, start and end time, start and end time padding, and
types of files to read
base_dir: str of the directory containing all the files to read
structured_dir: bool, if True, the base_dir contains a specific directory structure used by the
respective api formats. If False, base_dir only has the data files. Default False.
files_index: io.Index of the files that match the filter that are in base_dir
index_summary: io.IndexSummary of the filtered data
debug: bool, if True, output additional information during function execution. Default False.
"""
def __init__(
self,
base_dir: str,
structured_dir: bool = False,
        read_filter: Optional[io.ReadFilter] = None,
debug: bool = False,
pool: Optional[multiprocessing.pool.Pool] = None,
):
"""
Initialize the ApiReader object
:param base_dir: directory containing the files to read
:param structured_dir: if True, base_dir contains a specific directory structure used by the respective
api formats. If False, base_dir only has the data files. Default False.
        :param read_filter: ReadFilter for the data files; if None, read everything. Default None
        :param debug: if True, output additional statements during function execution. Default False.
        :param pool: optional multiprocessing pool to use when indexing files. Default None
"""
_pool: multiprocessing.pool.Pool = (
multiprocessing.Pool() if pool is None else pool
)
if read_filter:
self.filter = read_filter
if self.filter.station_ids:
self.filter.station_ids = set(self.filter.station_ids)
else:
self.filter = io.ReadFilter()
self.base_dir = base_dir
self.structured_dir = structured_dir
self.debug = debug
self.errors = RedVoxExceptions("APIReader")
self.files_index = self._get_all_files(_pool)
self.index_summary = io.IndexSummary.from_index(self._flatten_files_index())
if debug:
self.errors.print()
if pool is None:
_pool.close()
    def _flatten_files_index(self) -> io.Index:
"""
:return: flattened version of files_index
"""
result = io.Index()
for i in self.files_index:
result.append(i.entries)
return result
def _get_all_files(
self, pool: Optional[multiprocessing.pool.Pool] = None
) -> List[io.Index]:
"""
        get all the files in the base dir of the ApiReader
        :param pool: optional multiprocessing pool. Default None
        :return: list of indexes of the files that match the filter
"""
_pool: multiprocessing.pool.Pool = (
multiprocessing.Pool() if pool is None else pool
)
index: List[io.Index] = []
# this guarantees that all ids we search for are valid
all_index = self._apply_filter(pool=_pool)
for station_id in all_index.summarize().station_ids():
id_index = all_index.get_index_for_station_id(station_id)
checked_index = self._check_station_stats(id_index, pool=_pool)
index.extend(checked_index)
if pool is None:
_pool.close()
return index
def _apply_filter(
self,
reader_filter: Optional[io.ReadFilter] = None,
pool: Optional[multiprocessing.pool.Pool] = None,
) -> io.Index:
"""
apply the filter of the reader, or another filter if specified
        :param reader_filter: optional filter; if None, use the reader's filter. Default None
        :param pool: optional multiprocessing pool. Default None
        :return: index of the filtered files
"""
_pool: multiprocessing.pool.Pool = (
multiprocessing.Pool() if pool is None else pool
)
if not reader_filter:
reader_filter = self.filter
if self.structured_dir:
index = io.index_structured(self.base_dir, reader_filter, pool=_pool)
else:
index = io.index_unstructured(self.base_dir, reader_filter, pool=_pool)
if pool is None:
_pool.close()
return index
def _check_station_stats(
self,
station_index: io.Index,
pool: Optional[multiprocessing.pool.Pool] = None,
) -> List[io.Index]:
"""
        check the index's results; if it has enough information, return it; otherwise, search for more data.
        The index should only request one station id.
        If the station was restarted during the request period, a new group of indexes will be created
        to represent the change in station metadata.
        :param station_index: index representing the requested information
        :param pool: optional multiprocessing pool. Default None
        :return: list of indexes that include as much information as possible that fits the request
"""
_pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool
# if we found nothing, return the index
if len(station_index.entries) < 1:
return [station_index]
stats = fs.extract_stats(station_index, pool=_pool)
# Close pool if created here
if pool is None:
_pool.close()
timing_offsets: Optional[offset_model.TimingOffsets] = offset_model.compute_offsets(stats)
# punt if duration or other important values are invalid or if the latency array was empty
if timing_offsets is None:
return [station_index]
diff_s = diff_e = timedelta(seconds=0)
# if our filtered files do not encompass the request even when the packet times are updated
# try getting 1.5 times the difference of the expected start/end and the start/end of the data
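        # e.g. a request starting at 10:00:00 whose adjusted data starts at 10:00:30 searches an
        # extra 1.5 * 30s = 45s window that ends at the padded request start (start_dt minus start_dt_buf)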
insufficient_str = ""
if self.filter.start_dt and timing_offsets.adjusted_start > self.filter.start_dt:
insufficient_str += f" {self.filter.start_dt} (start)"
# diff_s = self.filter.start_dt_buf + 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
new_end = self.filter.start_dt - self.filter.start_dt_buf
new_start = new_end - 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
new_index = self._apply_filter(io.ReadFilter()
.with_start_dt(new_start)
.with_end_dt(new_end)
.with_extensions(self.filter.extensions)
.with_api_versions(self.filter.api_versions)
.with_station_ids(set(station_index.summarize().station_ids()))
.with_start_dt_buf(diff_s)
.with_end_dt_buf(diff_e))
if len(new_index.entries) > 0:
station_index.append(new_index.entries)
stats.extend(fs.extract_stats(new_index))
if self.filter.end_dt and timing_offsets.adjusted_end < self.filter.end_dt:
insufficient_str += f" {self.filter.end_dt} (end)"
# diff_e = self.filter.end_dt_buf + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
new_start = self.filter.end_dt + self.filter.end_dt_buf
new_end = new_start + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
new_index = self._apply_filter(io.ReadFilter()
.with_start_dt(new_start)
.with_end_dt(new_end)
.with_extensions(self.filter.extensions)
.with_api_versions(self.filter.api_versions)
.with_station_ids(set(station_index.summarize().station_ids()))
.with_start_dt_buf(diff_s)
.with_end_dt_buf(diff_e))
if len(new_index.entries) > 0:
station_index.append(new_index.entries)
stats.extend(fs.extract_stats(new_index))
if len(insufficient_str) > 0:
self.errors.append(f"Data for {station_index.summarize().station_ids()} exists, "
f"but not at:{insufficient_str}")
        # group the entries by app start time; a new app start time marks a station restart
        results = {}
        for v, e in enumerate(stats):
            key = e.app_start_dt
            if key not in results:
                results[key] = io.Index()
            results[key].append(entries=[station_index.entries[v]])
        return list(results.values())
@staticmethod
def read_files_in_index(indexf: io.Index) -> List[api_m.RedvoxPacketM]:
"""
        read all the files in the index
        :param indexf: index with files to read
        :return: list of RedvoxPacketM, converted from API 900 if necessary
"""
result: List[api_m.RedvoxPacketM] = []
# Iterate over the API 900 packets in a memory efficient way
# and convert to API 1000
# noinspection PyTypeChecker
for packet_900 in indexf.stream_raw(
io.ReadFilter.empty().with_api_versions({io.ApiVersion.API_900})
):
# noinspection Mypy
result.append(
ac.convert_api_900_to_1000_raw(packet_900)
)
# Grab the API 1000 packets
# noinspection PyTypeChecker
for packet in indexf.stream_raw(
io.ReadFilter.empty().with_api_versions({io.ApiVersion.API_1000})
):
# noinspection Mypy
result.append(packet)
return result
# noinspection PyTypeChecker
def read_files_by_id(self, station_id: str) -> Optional[List[api_m.RedvoxPacketM]]:
"""
:param station_id: the id to filter on
:return: the list of packets with the requested id, or None if the id can't be found
"""
result: List[api_m.RedvoxPacketM] = []
# Iterate over the API 900 packets in a memory efficient way
# and convert to API 1000
for packet_900 in self._flatten_files_index().stream_raw(
io.ReadFilter.empty()
.with_api_versions({io.ApiVersion.API_900})
.with_station_ids({station_id})
):
# noinspection Mypy
result.append(ac.convert_api_900_to_1000_raw(packet_900))
# Grab the API 1000 packets
for packet in self._flatten_files_index().stream_raw(
io.ReadFilter.empty()
.with_api_versions({io.ApiVersion.API_1000})
.with_station_ids({station_id})
):
# noinspection Mypy
result.append(packet)
if len(result) == 0:
return None
return result
def _stations_by_index(self, findex: io.Index) -> Station:
"""
:param findex: index with files to build a station with
:return: Station built from files in findex
"""
return Station(self.read_files_in_index(findex))
def get_stations(self, pool: Optional[multiprocessing.pool.Pool] = None) -> List[Station]:
"""
:param pool: optional multiprocessing pool
:return: List of all stations in the ApiReader
"""
return list(maybe_parallel_map(pool,
self._stations_by_index,
self.files_index,
chunk_size=1
)
)
def get_station_by_id(self, get_id: str) -> Optional[List[Station]]:
"""
:param get_id: the id to filter on
:return: list of all stations with the requested id or None if id can't be found
"""
result = [s for s in self.get_stations() if s.id == get_id]
if len(result) < 1:
return None
return result
Classes
class ApiReader (base_dir: str, structured_dir: bool = False, read_filter: Optional[ReadFilter] = None, debug: bool = False, pool: Optional[multiprocessing.pool.Pool] = None)
Reads data in API 900 or API 1000 format, converting all data read into RedvoxPacketM for ease of comparison and use.
Properties
filter: io.ReadFilter with the station ids, start and end time, start and end time padding, and types of files to read
base_dir: str of the directory containing all the files to read
structured_dir: bool, if True, the base_dir contains a specific directory structure used by the respective api formats. If False, base_dir only has the data files. Default False.
files_index: io.Index of the files that match the filter that are in base_dir
index_summary: io.IndexSummary of the filtered data
debug: bool, if True, output additional information during function execution. Default False.
Initialize the ApiReader object
:param base_dir: directory containing the files to read
:param structured_dir: if True, base_dir contains a specific directory structure used by the respective api formats. If False, base_dir only has the data files. Default False.
:param read_filter: ReadFilter for the data files; if None, read everything. Default None
:param debug: if True, output additional statements during function execution. Default False.
:param pool: optional multiprocessing pool to use when indexing files. Default None
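For example, a minimal sketch of reading one station from a structured directory; the path, station id, and filter times below are placeholders for illustration:

from datetime import datetime

from redvox.common import io
from redvox.common.api_reader import ApiReader

# placeholder directory and station id; substitute your own
read_filter = io.ReadFilter() \
    .with_station_ids({"1637610021"}) \
    .with_start_dt(datetime(2021, 1, 1, 0, 0, 0)) \
    .with_end_dt(datetime(2021, 1, 1, 1, 0, 0))
reader = ApiReader("/data/redvox", structured_dir=True, read_filter=read_filter)
# files_index holds one io.Index per station (and per station restart)
print(f"found {len(reader.files_index)} station groups")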
Static methods
def read_files_in_index(indexf: Index) ‑> List[redvox.api1000.proto.redvox_api_m_pb2.RedvoxPacketM]
read all the files in the index
:param indexf: index with files to read
:return: list of RedvoxPacketM, converted from API 900 if necessary
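A minimal sketch of reading packets from each of the reader's indexes (reader constructed as in the class example above):

for index in reader.files_index:
    # every packet comes back as RedvoxPacketM, even those stored as API 900
    packets = ApiReader.read_files_in_index(index)
    print(f"read {len(packets)} packets")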
Methods
def get_station_by_id(self, get_id: str) ‑> Optional[List[Station]]
:param get_id: the id to filter on
:return: list of all stations with the requested id or None if id can't be found
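For example, using the reader from the class example and a placeholder station id:

stations = reader.get_station_by_id("1637610021")
if stations is None:
    print("no station with that id")
else:
    for station in stations:
        # a station restart during the request produces multiple Station objects
        print(station.id)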
def get_stations(self, pool: Optional[multiprocessing.pool.Pool] = None) ‑> List[Station]
:param pool: optional multiprocessing pool
:return: List of all stations in the ApiReader
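A sketch of supplying an explicit multiprocessing pool for the parallel map:

import multiprocessing

with multiprocessing.Pool() as pool:
    stations = reader.get_stations(pool=pool)
print(f"built {len(stations)} stations")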
def read_files_by_id(self, station_id: str) ‑> Optional[List[redvox.api1000.proto.redvox_api_m_pb2.RedvoxPacketM]]
:param station_id: the id to filter on
:return: the list of packets with the requested id, or None if the id can't be found
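For example, with a placeholder station id:

packets = reader.read_files_by_id("1637610021")
if packets is None:
    print("no packets for that id")
else:
    # all returned packets are RedvoxPacketM, including those converted from API 900
    print(f"read {len(packets)} packets")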