import os
import io
import tempfile
import requests
import fsspec
import xarray
import geopandas
import pandas
from urllib.parse import urlparse
import asyncio
from functools import wraps, partial
from ..utils.response import ResponseFile
from .datasource import Datasource
from .catalog import Catalog
from .query import Query
DEFAULT_CONFIG = {"DATAMESH_SERVICE": "https://datamesh.oceanum.io"}
class DatameshConnectError(Exception):
    pass


class DatameshQueryError(Exception):
    pass
def asyncwrapper(func):
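    """Decorator that makes a blocking function awaitable by running it
    in an executor. The wrapped function accepts optional ``loop`` and
    ``executor`` keyword arguments.
    """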
@wraps(func)
async def run(*args, loop=None, executor=None, **kwargs):
        if loop is None:
            loop = asyncio.get_running_loop()
pfunc = partial(func, *args, **kwargs)
return await loop.run_in_executor(executor, pfunc)
return run
class Connector(object):
"""Datamesh connector class.
All datamesh operations are methods of this class
"""
    def __init__(
self,
token=None,
service=os.environ.get("DATAMESH_SERVICE", DEFAULT_CONFIG["DATAMESH_SERVICE"]),
gateway=None,
):
"""Datamesh connector constructor
Args:
token (string): Your datamesh access token. Defaults to os.environ.get("DATAMESH_TOKEN", None).
service (string, optional): URL of datamesh service. Defaults to os.environ.get("DATAMESH_SERVICE", "https://datamesh.oceanum.io").
gateway (string, optional): URL of gateway service. Defaults to os.environ.get("DATAMESH_GATEWAY", "https://gateway.<datamesh_service_domain>").
Raises:
ValueError: Missing or invalid arguments
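        Example:
            A minimal sketch; the import path and token value are illustrative::

                from oceanum.datamesh import Connector
                conn = Connector(token="my-datamesh-token")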
"""
if token is None:
token = os.environ.get("DATAMESH_TOKEN", None)
if token is None:
raise ValueError(
"A valid key must be supplied as a connection constructor argument or defined in environment variables as DATAMESH_TOKEN"
)
self._token = token
url = urlparse(service)
self._proto = url.scheme
self._host = url.hostname
self._auth_headers = {
"Authorization": "Token " + self._token,
"X-DATAMESH-TOKEN": self._token,
}
        self._gateway = gateway or os.environ.get(
            "DATAMESH_GATEWAY", f"{self._proto}://gateway.{self._host}"
        )
@property
def host(self):
"""Datamesh host
Returns:
string: Datamesh server host
"""
return self._host
# Check the status of the metadata server
def _status(self):
resp = requests.get(f"{self._proto}://{self._host}", headers=self._auth_headers)
        return resp.status_code == 200
def _metadata_request(self, datasource_id=""):
resp = requests.get(
f"{self._proto}://{self._host}/datasource/{datasource_id}",
headers=self._auth_headers,
)
return resp
def _zarr_proxy(self, datasource_id):
try:
mapper = fsspec.get_mapper(
f"{self._gateway}/zarr/{datasource_id}",
headers=self._auth_headers,
)
except Exception as e:
raise DatameshConnectError(str(e))
return mapper
def _data_request(self, datasource_id, data_format="application/json"):
resp = requests.get(
f"{self._gateway}/data/{datasource_id}",
headers={"Accept": data_format, **self._auth_headers},
)
        if resp.status_code != 200:
            raise DatameshConnectError(resp.text)
        return ResponseFile(resp.content)
def _query_request(self, query, data_format="application/json"):
headers = {"Accept": data_format, **self._auth_headers}
resp = requests.post(
f"{self._gateway}/oceanql/", headers=headers, data=query.json()
)
        if resp.status_code != 200:
            raise DatameshQueryError(resp.text)
        return resp.content
    def _query(self, query):
        if not isinstance(query, Query):
            query = Query(**query)
        ds = self.get_datasource(query.datasource)
        # Gridded datasources transfer as netCDF4; tabular ones as parquet
        transfer_format = (
            "application/x-netcdf4"
            if ds.container == xarray.Dataset
            else "application/parquet"
        )
        resp = self._query_request(query, data_format=transfer_format)
        # Buffer the raw response, then decode it into the datasource's container type
        with tempfile.SpooledTemporaryFile() as f:
            f.write(resp)
            f.seek(0)
            if ds.container == xarray.Dataset:
                return xarray.open_dataset(f, engine="h5netcdf").load()
            elif ds.container == geopandas.GeoDataFrame:
                return geopandas.read_parquet(f)
            else:
                return pandas.read_parquet(f)
    def get_catalog(self, filter={}):
"""Get datamesh catalog
Args:
filter (dict, optional): Set of filters to apply. Defaults to {}.
Returns:
:obj:`oceanum.datamesh.Catalog`: A datamesh catalog instance
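        Example:
            A minimal sketch, reusing an existing connector instance::

                cat = conn.get_catalog()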
"""
        return Catalog._init(self)
    @asyncwrapper
    def get_catalog_async(self, filter={}):
"""Get datamesh catalog asynchronously
Args:
filter (dict, optional): Set of filters to apply. Defaults to {}.
loop: event loop. default=None will use :obj:`asyncio.get_running_loop()`
executor: :obj:`concurrent.futures.Executor` instance. default=None will use the default executor
Returns:
Coroutine<:obj:`oceanum.datamesh.Catalog`>: A datamesh catalog instance
"""
        return Catalog._init(self)
    def get_datasource(self, datasource_id):
"""Get a Datasource instance from the datamesh. This does not load the actual data.
Args:
datasource_id (string): Unique datasource id
Returns:
:obj:`oceanum.datamesh.Datasource`: A datasource instance
Raises:
DatameshConnectError: Datasource cannot be found or is not authorized for the datamesh key
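        Example:
            A minimal sketch; the datasource id is hypothetical::

                ds = conn.get_datasource("era5_wind")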
"""
return Datasource._init(self, datasource_id)
    @asyncwrapper
def get_datasource_async(self, datasource_id):
"""Get a Datasource instance from the datamesh asynchronously. This does not load the actual data.
Args:
datasource_id (string): Unique datasource id
loop: event loop. default=None will use :obj:`asyncio.get_running_loop()`
executor: :obj:`concurrent.futures.Executor` instance. default=None will use the default executor
Returns:
Coroutine<:obj:`oceanum.datamesh.Datasource`>: A datasource instance
Raises:
DatameshConnectError: Datasource cannot be found or is not authorized for the datamesh key
"""
return Datasource._init(self, datasource_id)
    def load_datasource(self, datasource_id, use_dask=True):
"""Load a datasource into the work environment
Args:
datasource_id (string): Unique datasource id
use_dask (bool, optional): Load datasource as a dask enabled datasource if possible. Defaults to True.
Returns:
Union[:obj:`pandas.DataFrame`, :obj:`geopandas.GeoDataFrame`, :obj:`xarray.Dataset`]: The datasource container
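        Example:
            A minimal sketch; the datasource id is hypothetical::

                data = conn.load_datasource("era5_wind")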
"""
ds = self.get_datasource(datasource_id)
return ds.load()
    @asyncwrapper
def load_datasource_async(self, datasource_id, use_dask=True):
"""Load a datasource asynchronously into the work environment
Args:
datasource_id (string): Unique datasource id
use_dask (bool, optional): Load datasource as a dask enabled datasource if possible. Defaults to True.
loop: event loop. default=None will use :obj:`asyncio.get_running_loop()`
executor: :obj:`concurrent.futures.Executor` instance. default=None will use the default executor
Returns:
coroutine<Union[:obj:`pandas.DataFrame`, :obj:`geopandas.GeoDataFrame`, :obj:`xarray.Dataset`]>: The datasource container
"""
ds = self.get_datasource(datasource_id)
return ds.load()
    def query(self, query):
"""Make a datamesh query
Args:
query (Union[:obj:`oceanum.datamesh.Query`, dict]): Datamesh query as a query object or a valid query dictionary
Returns:
Union[:obj:`pandas.DataFrame`, :obj:`geopandas.GeoDataFrame`, :obj:`xarray.Dataset`]: The datasource container
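        Example:
            A minimal sketch; the datasource id is hypothetical::

                data = conn.query({"datasource": "era5_wind"})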
"""
return self._query(query)
    @asyncwrapper
    def query_async(self, query):
"""Make a datamesh query asynchronously
Args:
query (Union[:obj:`oceanum.datamesh.Query`, dict]): Datamesh query as a query object or a valid query dictionary
loop: event loop. default=None will use :obj:`asyncio.get_running_loop()`
executor: :obj:`concurrent.futures.Executor` instance. default=None will use the default executor
Returns:
Coroutine<Union[:obj:`pandas.DataFrame`, :obj:`geopandas.GeoDataFrame`, :obj:`xarray.Dataset`]>: The datasource container
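        Example:
            A minimal sketch inside a coroutine; the datasource id is hypothetical::

                data = await conn.query_async({"datasource": "era5_wind"})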
"""
return self._query(query)