muutils.jsonlines
utilities for reading and writing jsonlines files, including gzip support
"utilities for reading and writing jsonlines files, including gzip support"

from __future__ import annotations

import gzip
import json
from typing import Callable, Sequence

from muutils.json_serialize import JSONitem

# file name suffixes that are treated as gzip-compressed
_GZIP_EXTENSIONS: tuple = (".gz", ".gzip")


def _file_is_gzip(path: str) -> bool:
    """return `True` if `path` ends with one of `_GZIP_EXTENSIONS`"""
    # `str.endswith` accepts a tuple of suffixes, so no explicit loop is needed
    return str(path).endswith(_GZIP_EXTENSIONS)


def _get_opener(
    path: str,
    use_gzip: bool | None = None,
) -> Callable:
    """pick the callable used to open `path`: builtin `open` or `gzip.open`

    Args:
        path: file path, only inspected when `use_gzip` is `None`
        use_gzip: force gzip on (`True`) or off (`False`); `None` decides
            from the file extension via `_file_is_gzip`

    Returns:
        `gzip.open` when gzip is selected, otherwise the builtin `open`
    """
    if use_gzip is None:
        use_gzip = _file_is_gzip(path)

    # appears to be another mypy bug
    # https://github.com/python/mypy/issues/10740
    return open if not use_gzip else gzip.open  # type: ignore


def jsonl_load(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[JSONitem]:
    """load a jsonlines file, returning one parsed item per line

    Args:
        path: file to read (positional-only)
        use_gzip: force gzip on or off; `None` decides from the file extension

    Returns:
        list with the result of `json.loads` for every line, in file order
    """
    opener: Callable = _get_opener(path, use_gzip)

    with opener(path, "rt", encoding="UTF-8") as f:
        return [json.loads(line) for line in f]


def jsonl_load_log(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[dict]:
    """load a jsonlines file and assert that every item is a dict

    Args:
        path: file to read (positional-only)
        use_gzip: force gzip on or off; `None` decides from the file extension

    Returns:
        the list produced by `jsonl_load`

    Raises:
        AssertionError: if any item is not a dict
            (the check is an `assert`, so it is skipped under `python -O`)
    """
    data: list[JSONitem] = jsonl_load(path, use_gzip=use_gzip)
    for idx, item in enumerate(data):
        assert isinstance(
            item, dict
        ), f"item {idx = } from file {path} is not a dict: {type(item) = }\t{item = }"

    # mypy complains that we are returning a list[JSONitem] but the function signature says list[dict]
    # it can't figure out that we are asserting that all items are dicts
    return data  # type: ignore


def jsonl_write(
    path: str,
    items: Sequence[JSONitem],
    use_gzip: bool | None = None,
    gzip_compresslevel: int = 2,
) -> None:
    """write `items` to `path`, one JSON document per line

    Args:
        path: file to write
        items: items to serialize with `json.dumps`
        use_gzip: force gzip on or off; `None` decides from the file extension
        gzip_compresslevel: passed to `gzip.open` as `compresslevel`
            whenever gzip is used
    """
    # resolve the auto-detection up front, so that `gzip_compresslevel` is
    # also applied when gzip is inferred from the extension rather than
    # requested explicitly
    if use_gzip is None:
        use_gzip = _file_is_gzip(path)

    opener: Callable = _get_opener(path, use_gzip)

    # the builtin `open` does not accept `compresslevel`, only pass it to gzip
    opener_kwargs: dict = dict(compresslevel=gzip_compresslevel) if use_gzip else dict()

    with opener(path, "wt", encoding="UTF-8", **opener_kwargs) as f:
        f.writelines(json.dumps(item) + "\n" for item in items)
def
jsonl_load( path: str, /, *, use_gzip: bool | None = None) -> list[typing.Union[bool, int, float, str, NoneType, typing.List[typing.Union[bool, int, float, str, NoneType, typing.List[typing.Any], typing.Dict[str, typing.Any]]], typing.Dict[str, typing.Union[bool, int, float, str, NoneType, typing.List[typing.Any], typing.Dict[str, typing.Any]]]]]:
def jsonl_load(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[JSONitem]:
    """load a jsonlines file, returning one parsed item per line

    Args:
        path: file to read (positional-only)
        use_gzip: forwarded to `_get_opener` to choose how the file is opened

    Returns:
        list with the result of `json.loads` for every line, in file order
    """
    open_file: Callable = _get_opener(path, use_gzip)

    # the file is opened in text mode, so iterating it yields one line at a time
    with open_file(path, "rt", encoding="UTF-8") as handle:
        return [json.loads(raw_line) for raw_line in handle]
def
jsonl_load_log(path: str, /, *, use_gzip: bool | None = None) -> list[dict]:
def jsonl_load_log(
    path: str,
    /,
    *,
    use_gzip: bool | None = None,
) -> list[dict]:
    """load a jsonlines file and assert that every item is a dict

    Args:
        path: file to read (positional-only)
        use_gzip: forwarded unchanged to `jsonl_load`

    Returns:
        the list produced by `jsonl_load`

    Raises:
        AssertionError: if any item is not a dict
            (the check is an `assert`, so it is skipped under `python -O`)
    """
    records: list[JSONitem] = jsonl_load(path, use_gzip=use_gzip)

    # NOTE: `idx` and `item` must keep these names, the `{name = }` f-string
    # fields below embed the variable names in the error message
    for idx, item in enumerate(records):
        assert isinstance(
            item, dict
        ), f"item {idx = } from file {path} is not a dict: {type(item) = }\t{item = }"

    # the declared return type is list[dict], but mypy cannot tell that the
    # asserts above narrowed every element of the list[JSONitem]
    return records  # type: ignore
def
jsonl_write( path: str, items: Sequence[Union[bool, int, float, str, NoneType, List[Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]], Dict[str, Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]]]], use_gzip: bool | None = None, gzip_compresslevel: int = 2) -> None:
def jsonl_write(
    path: str,
    items: Sequence[JSONitem],
    use_gzip: bool | None = None,
    gzip_compresslevel: int = 2,
) -> None:
    """write `items` to `path`, one JSON document per line

    Args:
        path: file to write
        items: items to serialize with `json.dumps`
        use_gzip: force gzip on or off; `None` decides from the file extension
        gzip_compresslevel: passed to the gzip opener as `compresslevel`
            whenever gzip is used
    """
    # resolve the auto-detection up front, so that `gzip_compresslevel` is
    # also applied when gzip is inferred from the extension rather than
    # requested explicitly (previously it was silently dropped in that case)
    if use_gzip is None:
        use_gzip = _file_is_gzip(path)

    opener: Callable = _get_opener(path, use_gzip)

    # the builtin `open` does not accept `compresslevel`, only pass it to gzip
    opener_kwargs: dict = dict(compresslevel=gzip_compresslevel) if use_gzip else dict()

    with opener(path, "wt", encoding="UTF-8", **opener_kwargs) as f:
        f.writelines(json.dumps(item) + "\n" for item in items)