Advanced Read Operations¶
ParquetDB offers a read
method that enables a range of advanced data retrieval options. These options include:
Filtering data by column values (predicate pushdown),
Selecting or excluding specific columns,
Batch processing for optimized memory usage,
Rebuilding nested structures for hierarchical data, and more.
In this notebook, we will explore how to leverage advanced read capabilities to efficiently retrieve data that meets your exact requirements.
The read
method has the following signature:
# Signature of the ParquetDB read method as presented in this guide (body elided).
def read(
    self,
    ids: List[int] = None,
    columns: List[str] = None,  # column names to select (or to exclude, see include_cols)
    filters: List[pc.Expression] = None,  # pyarrow expressions, combined with `and`
    load_format: str = "table",  # "table" by default; "batches" returns a generator
    batch_size: int = None,  # used together with load_format="batches"
    include_cols: bool = True,  # False reads every column except those in `columns`
    rebuild_nested_struct: bool = False,  # True returns nested data in its original form
    rebuild_nested_from_scratch: bool = False,  # rebuild the nested files after data changes
    load_config: LoadConfig = LoadConfig(),  # extra controls for the batching process
    normalize_config: NormalizeConfig = NormalizeConfig(),
) -> Union[pa.Table, Generator, Any]:
    ...
[1]:
import pprint
import time
import os
import shutil
import pyarrow as pa
from parquetdb import ParquetDB, LoadConfig, NormalizeConfig
from parquetdb.utils.general_utils import generate_similar_data
# Template entry: a few typed fields plus 500 string columns.
template_dict = {
    "float_field": 10,
    "int_field": 10,
    "name": "item",
    "nested_value": {"value": 10, "name": "item"},
    "list_field": [1, 2, 3],
}
template_dict.update({f"column_{x}": "test" for x in range(500)})
template = [template_dict]

# Expand the template into many similar entries.
num_entries = 100000  # Feel free to adjust this
data = generate_similar_data(template, num_entries)
print("Generated Data:")
pprint.pprint(data[0])

# Start from a clean database directory.
db_path = "ParquetDB"
if os.path.exists(db_path):
    shutil.rmtree(db_path)

db = ParquetDB(db_path=db_path)
db.create(data)
print(db)

# Drop the reference to the generated entries once they are stored.
data = None
Generated Data:
{'column_0': 'test_64',
'column_1': 'test_83',
'column_10': 'test_71',
'column_100': 'test_68',
'column_101': 'test_65',
'column_102': 'test_67',
'column_103': 'test_60',
'column_104': 'test_73',
'column_105': 'test_10',
'column_106': 'test_74',
'column_107': 'test_73',
'column_108': 'test_60',
'column_109': 'test_58',
'column_11': 'test_39',
'column_110': 'test_100',
'column_111': 'test_18',
'column_112': 'test_42',
'column_113': 'test_62',
'column_114': 'test_58',
'column_115': 'test_16',
'column_116': 'test_84',
'column_117': 'test_32',
'column_118': 'test_11',
'column_119': 'test_28',
'column_12': 'test_73',
'column_120': 'test_49',
'column_121': 'test_12',
'column_122': 'test_22',
'column_123': 'test_52',
'column_124': 'test_37',
'column_125': 'test_1',
'column_126': 'test_47',
'column_127': 'test_37',
'column_128': 'test_12',
'column_129': 'test_15',
'column_13': 'test_36',
'column_130': 'test_35',
'column_131': 'test_54',
'column_132': 'test_34',
'column_133': 'test_94',
'column_134': 'test_65',
'column_135': 'test_42',
'column_136': 'test_10',
'column_137': 'test_57',
'column_138': 'test_68',
'column_139': 'test_31',
'column_14': 'test_24',
'column_140': 'test_4',
'column_141': 'test_34',
'column_142': 'test_36',
'column_143': 'test_73',
'column_144': 'test_2',
'column_145': 'test_12',
'column_146': 'test_22',
'column_147': 'test_5',
'column_148': 'test_74',
'column_149': 'test_78',
'column_15': 'test_100',
'column_150': 'test_60',
'column_151': 'test_1',
'column_152': 'test_75',
'column_153': 'test_48',
'column_154': 'test_52',
'column_155': 'test_40',
'column_156': 'test_14',
'column_157': 'test_66',
'column_158': 'test_65',
'column_159': 'test_76',
'column_16': 'test_37',
'column_160': 'test_52',
'column_161': 'test_8',
'column_162': 'test_41',
'column_163': 'test_76',
'column_164': 'test_22',
'column_165': 'test_92',
'column_166': 'test_70',
'column_167': 'test_67',
'column_168': 'test_67',
'column_169': 'test_74',
'column_17': 'test_30',
'column_170': 'test_86',
'column_171': 'test_5',
'column_172': 'test_36',
'column_173': 'test_56',
'column_174': 'test_32',
'column_175': 'test_25',
'column_176': 'test_29',
'column_177': 'test_78',
'column_178': 'test_91',
'column_179': 'test_75',
'column_18': 'test_29',
'column_180': 'test_30',
'column_181': 'test_15',
'column_182': 'test_21',
'column_183': 'test_37',
'column_184': 'test_57',
'column_185': 'test_63',
'column_186': 'test_82',
'column_187': 'test_49',
'column_188': 'test_60',
'column_189': 'test_32',
'column_19': 'test_66',
'column_190': 'test_69',
'column_191': 'test_56',
'column_192': 'test_77',
'column_193': 'test_25',
'column_194': 'test_4',
'column_195': 'test_24',
'column_196': 'test_89',
'column_197': 'test_29',
'column_198': 'test_10',
'column_199': 'test_75',
'column_2': 'test_51',
'column_20': 'test_44',
'column_200': 'test_76',
'column_201': 'test_1',
'column_202': 'test_7',
'column_203': 'test_35',
'column_204': 'test_33',
'column_205': 'test_91',
'column_206': 'test_83',
'column_207': 'test_3',
'column_208': 'test_91',
'column_209': 'test_63',
'column_21': 'test_9',
'column_210': 'test_3',
'column_211': 'test_45',
'column_212': 'test_88',
'column_213': 'test_4',
'column_214': 'test_90',
'column_215': 'test_66',
'column_216': 'test_16',
'column_217': 'test_37',
'column_218': 'test_84',
'column_219': 'test_82',
'column_22': 'test_2',
'column_220': 'test_96',
'column_221': 'test_3',
'column_222': 'test_4',
'column_223': 'test_28',
'column_224': 'test_79',
'column_225': 'test_50',
'column_226': 'test_67',
'column_227': 'test_23',
'column_228': 'test_96',
'column_229': 'test_79',
'column_23': 'test_24',
'column_230': 'test_26',
'column_231': 'test_61',
'column_232': 'test_82',
'column_233': 'test_38',
'column_234': 'test_6',
'column_235': 'test_33',
'column_236': 'test_16',
'column_237': 'test_92',
'column_238': 'test_1',
'column_239': 'test_6',
'column_24': 'test_19',
'column_240': 'test_86',
'column_241': 'test_28',
'column_242': 'test_38',
'column_243': 'test_82',
'column_244': 'test_8',
'column_245': 'test_41',
'column_246': 'test_34',
'column_247': 'test_11',
'column_248': 'test_13',
'column_249': 'test_36',
'column_25': 'test_36',
'column_250': 'test_14',
'column_251': 'test_28',
'column_252': 'test_87',
'column_253': 'test_70',
'column_254': 'test_28',
'column_255': 'test_4',
'column_256': 'test_68',
'column_257': 'test_38',
'column_258': 'test_60',
'column_259': 'test_95',
'column_26': 'test_92',
'column_260': 'test_83',
'column_261': 'test_22',
'column_262': 'test_22',
'column_263': 'test_26',
'column_264': 'test_87',
'column_265': 'test_88',
'column_266': 'test_60',
'column_267': 'test_96',
'column_268': 'test_73',
'column_269': 'test_79',
'column_27': 'test_6',
'column_270': 'test_13',
'column_271': 'test_63',
'column_272': 'test_14',
'column_273': 'test_44',
'column_274': 'test_5',
'column_275': 'test_80',
'column_276': 'test_89',
'column_277': 'test_52',
'column_278': 'test_8',
'column_279': 'test_82',
'column_28': 'test_88',
'column_280': 'test_86',
'column_281': 'test_22',
'column_282': 'test_90',
'column_283': 'test_94',
'column_284': 'test_9',
'column_285': 'test_15',
'column_286': 'test_96',
'column_287': 'test_65',
'column_288': 'test_91',
'column_289': 'test_82',
'column_29': 'test_69',
'column_290': 'test_60',
'column_291': 'test_3',
'column_292': 'test_16',
'column_293': 'test_11',
'column_294': 'test_93',
'column_295': 'test_91',
'column_296': 'test_83',
'column_297': 'test_80',
'column_298': 'test_68',
'column_299': 'test_88',
'column_3': 'test_22',
'column_30': 'test_75',
'column_300': 'test_75',
'column_301': 'test_63',
'column_302': 'test_7',
'column_303': 'test_97',
'column_304': 'test_75',
'column_305': 'test_35',
'column_306': 'test_20',
'column_307': 'test_72',
'column_308': 'test_29',
'column_309': 'test_43',
'column_31': 'test_57',
'column_310': 'test_33',
'column_311': 'test_36',
'column_312': 'test_44',
'column_313': 'test_70',
'column_314': 'test_35',
'column_315': 'test_10',
'column_316': 'test_38',
'column_317': 'test_71',
'column_318': 'test_15',
'column_319': 'test_95',
'column_32': 'test_90',
'column_320': 'test_53',
'column_321': 'test_52',
'column_322': 'test_80',
'column_323': 'test_92',
'column_324': 'test_25',
'column_325': 'test_96',
'column_326': 'test_76',
'column_327': 'test_30',
'column_328': 'test_72',
'column_329': 'test_68',
'column_33': 'test_69',
'column_330': 'test_26',
'column_331': 'test_39',
'column_332': 'test_86',
'column_333': 'test_99',
'column_334': 'test_15',
'column_335': 'test_48',
'column_336': 'test_91',
'column_337': 'test_4',
'column_338': 'test_64',
'column_339': 'test_32',
'column_34': 'test_74',
'column_340': 'test_20',
'column_341': 'test_37',
'column_342': 'test_71',
'column_343': 'test_70',
'column_344': 'test_65',
'column_345': 'test_78',
'column_346': 'test_3',
'column_347': 'test_80',
'column_348': 'test_94',
'column_349': 'test_25',
'column_35': 'test_93',
'column_350': 'test_40',
'column_351': 'test_3',
'column_352': 'test_68',
'column_353': 'test_76',
'column_354': 'test_39',
'column_355': 'test_60',
'column_356': 'test_2',
'column_357': 'test_19',
'column_358': 'test_90',
'column_359': 'test_65',
'column_36': 'test_36',
'column_360': 'test_53',
'column_361': 'test_41',
'column_362': 'test_67',
'column_363': 'test_13',
'column_364': 'test_39',
'column_365': 'test_66',
'column_366': 'test_43',
'column_367': 'test_36',
'column_368': 'test_22',
'column_369': 'test_36',
'column_37': 'test_96',
'column_370': 'test_66',
'column_371': 'test_100',
'column_372': 'test_96',
'column_373': 'test_21',
'column_374': 'test_19',
'column_375': 'test_32',
'column_376': 'test_37',
'column_377': 'test_34',
'column_378': 'test_5',
'column_379': 'test_13',
'column_38': 'test_49',
'column_380': 'test_39',
'column_381': 'test_20',
'column_382': 'test_98',
'column_383': 'test_69',
'column_384': 'test_13',
'column_385': 'test_55',
'column_386': 'test_37',
'column_387': 'test_77',
'column_388': 'test_20',
'column_389': 'test_31',
'column_39': 'test_76',
'column_390': 'test_16',
'column_391': 'test_20',
'column_392': 'test_43',
'column_393': 'test_81',
'column_394': 'test_24',
'column_395': 'test_63',
'column_396': 'test_35',
'column_397': 'test_94',
'column_398': 'test_63',
'column_399': 'test_91',
'column_4': 'test_6',
'column_40': 'test_66',
'column_400': 'test_87',
'column_401': 'test_32',
'column_402': 'test_71',
'column_403': 'test_64',
'column_404': 'test_93',
'column_405': 'test_52',
'column_406': 'test_84',
'column_407': 'test_43',
'column_408': 'test_63',
'column_409': 'test_94',
'column_41': 'test_14',
'column_410': 'test_64',
'column_411': 'test_32',
'column_412': 'test_85',
'column_413': 'test_26',
'column_414': 'test_22',
'column_415': 'test_47',
'column_416': 'test_65',
'column_417': 'test_85',
'column_418': 'test_65',
'column_419': 'test_18',
'column_42': 'test_70',
'column_420': 'test_41',
'column_421': 'test_3',
'column_422': 'test_28',
'column_423': 'test_38',
'column_424': 'test_30',
'column_425': 'test_69',
'column_426': 'test_52',
'column_427': 'test_86',
'column_428': 'test_95',
'column_429': 'test_16',
'column_43': 'test_43',
'column_430': 'test_89',
'column_431': 'test_37',
'column_432': 'test_10',
'column_433': 'test_62',
'column_434': 'test_12',
'column_435': 'test_30',
'column_436': 'test_49',
'column_437': 'test_54',
'column_438': 'test_34',
'column_439': 'test_47',
'column_44': 'test_75',
'column_440': 'test_74',
'column_441': 'test_58',
'column_442': 'test_75',
'column_443': 'test_98',
'column_444': 'test_53',
'column_445': 'test_27',
'column_446': 'test_70',
'column_447': 'test_64',
'column_448': 'test_48',
'column_449': 'test_85',
'column_45': 'test_89',
'column_450': 'test_40',
'column_451': 'test_94',
'column_452': 'test_51',
'column_453': 'test_14',
'column_454': 'test_63',
'column_455': 'test_63',
'column_456': 'test_72',
'column_457': 'test_85',
'column_458': 'test_20',
'column_459': 'test_88',
'column_46': 'test_50',
'column_460': 'test_86',
'column_461': 'test_43',
'column_462': 'test_60',
'column_463': 'test_29',
'column_464': 'test_28',
'column_465': 'test_47',
'column_466': 'test_52',
'column_467': 'test_75',
'column_468': 'test_40',
'column_469': 'test_18',
'column_47': 'test_30',
'column_470': 'test_83',
'column_471': 'test_38',
'column_472': 'test_62',
'column_473': 'test_43',
'column_474': 'test_50',
'column_475': 'test_26',
'column_476': 'test_6',
'column_477': 'test_75',
'column_478': 'test_76',
'column_479': 'test_81',
'column_48': 'test_25',
'column_480': 'test_58',
'column_481': 'test_81',
'column_482': 'test_37',
'column_483': 'test_29',
'column_484': 'test_18',
'column_485': 'test_69',
'column_486': 'test_88',
'column_487': 'test_43',
'column_488': 'test_90',
'column_489': 'test_51',
'column_49': 'test_3',
'column_490': 'test_55',
'column_491': 'test_70',
'column_492': 'test_9',
'column_493': 'test_1',
'column_494': 'test_64',
'column_495': 'test_31',
'column_496': 'test_64',
'column_497': 'test_71',
'column_498': 'test_22',
'column_499': 'test_3',
'column_5': 'test_81',
'column_50': 'test_76',
'column_51': 'test_80',
'column_52': 'test_19',
'column_53': 'test_77',
'column_54': 'test_16',
'column_55': 'test_78',
'column_56': 'test_100',
'column_57': 'test_10',
'column_58': 'test_54',
'column_59': 'test_49',
'column_6': 'test_56',
'column_60': 'test_45',
'column_61': 'test_7',
'column_62': 'test_65',
'column_63': 'test_92',
'column_64': 'test_31',
'column_65': 'test_44',
'column_66': 'test_90',
'column_67': 'test_12',
'column_68': 'test_21',
'column_69': 'test_83',
'column_7': 'test_24',
'column_70': 'test_41',
'column_71': 'test_83',
'column_72': 'test_75',
'column_73': 'test_86',
'column_74': 'test_46',
'column_75': 'test_99',
'column_76': 'test_13',
'column_77': 'test_37',
'column_78': 'test_94',
'column_79': 'test_8',
'column_8': 'test_29',
'column_80': 'test_7',
'column_81': 'test_54',
'column_82': 'test_94',
'column_83': 'test_85',
'column_84': 'test_98',
'column_85': 'test_87',
'column_86': 'test_44',
'column_87': 'test_33',
'column_88': 'test_44',
'column_89': 'test_95',
'column_9': 'test_88',
'column_90': 'test_23',
'column_91': 'test_36',
'column_92': 'test_80',
'column_93': 'test_1',
'column_94': 'test_78',
'column_95': 'test_67',
'column_96': 'test_20',
'column_97': 'test_43',
'column_98': 'test_49',
'column_99': 'test_85',
'float_field': 11,
'int_field': 6,
'list_field': [1, 2, 3],
'name': 'item_22',
'nested_value': {'name': 'item_13', 'value': 16}}
============================================================
PARQUETDB SUMMARY
============================================================
Database path: ParquetDB
• Number of columns: 507
• Number of rows: 100000
• Number of files: 1
• Number of rows per file: [100000]
• Number of row groups per file: [4]
• Serialized metadata size per file: [225147] Bytes
############################################################
METADATA
############################################################
############################################################
COLUMN DETAILS
############################################################
• Columns:
- column_8
- column_191
- column_195
- column_261
- column_118
- column_135
- column_50
- column_156
- column_69
- column_498
- column_196
- column_358
- column_451
- column_49
- column_70
- column_492
- column_493
- column_288
- column_340
- column_327
- column_468
- column_88
- column_193
- column_283
- column_121
- column_409
- column_97
- column_174
- column_160
- column_318
- column_482
- column_291
- column_0
- column_23
- column_270
- column_279
- column_382
- column_230
- column_456
- column_184
- column_259
- column_457
- column_273
- column_296
- column_90
- column_280
- column_323
- column_57
- column_94
- column_439
- column_205
- column_253
- nested_value.value
- column_432
- column_206
- column_368
- column_369
- column_167
- column_117
- column_152
- column_349
- column_39
- column_164
- column_307
- column_257
- column_82
- column_157
- column_194
- column_18
- column_467
- column_290
- column_297
- column_221
- column_294
- column_374
- column_331
- column_225
- column_443
- column_154
- column_59
- column_44
- column_308
- column_491
- column_357
- column_386
- column_423
- column_465
- column_110
- column_306
- column_263
- column_240
- column_401
- column_115
- column_92
- column_223
- column_21
- column_168
- column_412
- column_190
- column_487
- column_43
- column_34
- column_24
- column_298
- column_362
- column_281
- column_309
- column_381
- column_67
- column_17
- column_311
- column_245
- column_106
- column_472
- column_131
- column_391
- column_399
- column_445
- column_435
- column_376
- column_434
- column_344
- column_338
- column_202
- column_5
- column_350
- column_102
- column_217
- column_162
- column_212
- column_418
- column_178
- column_304
- column_353
- column_256
- column_53
- column_239
- column_469
- column_120
- column_12
- column_485
- column_124
- column_60
- column_11
- column_452
- column_9
- column_373
- column_100
- column_98
- column_20
- column_414
- column_47
- column_36
- column_268
- column_464
- column_198
- column_411
- column_436
- column_10
- column_228
- column_254
- column_158
- column_286
- column_182
- column_375
- column_475
- column_185
- column_244
- column_147
- column_260
- column_180
- column_315
- column_470
- id
- column_192
- column_285
- column_138
- column_163
- column_459
- column_287
- column_139
- column_243
- column_7
- column_3
- column_16
- column_177
- column_380
- column_219
- column_483
- column_295
- column_48
- column_32
- column_366
- column_430
- column_40
- column_54
- column_61
- column_330
- column_495
- column_275
- column_425
- column_132
- column_76
- column_200
- column_313
- column_301
- column_246
- column_326
- column_416
- column_19
- column_247
- column_406
- column_119
- column_277
- column_155
- column_30
- column_220
- column_354
- column_75
- column_499
- column_4
- column_89
- column_176
- column_255
- column_312
- column_413
- column_13
- column_395
- column_188
- column_91
- column_130
- float_field
- column_210
- column_461
- column_227
- column_148
- column_490
- column_146
- column_463
- nested_value.name
- column_37
- column_302
- column_410
- column_171
- column_222
- column_77
- column_232
- column_137
- column_359
- column_486
- column_27
- column_101
- column_474
- column_231
- column_62
- column_99
- column_363
- column_179
- column_404
- column_242
- column_329
- column_494
- column_497
- column_52
- column_63
- column_170
- column_25
- column_473
- column_6
- column_317
- column_265
- column_112
- column_437
- column_383
- column_144
- column_103
- column_142
- column_55
- column_258
- column_355
- column_428
- column_233
- column_333
- column_35
- column_324
- column_165
- column_56
- column_211
- column_45
- column_372
- column_397
- column_476
- column_134
- column_364
- column_111
- column_214
- column_293
- column_377
- column_81
- column_235
- column_419
- column_1
- column_421
- column_334
- column_28
- column_378
- column_332
- column_422
- column_189
- column_325
- column_447
- column_360
- column_116
- column_264
- column_237
- column_400
- column_319
- column_371
- column_159
- column_390
- column_384
- column_127
- column_31
- column_161
- column_133
- column_73
- column_209
- column_394
- column_454
- column_173
- column_83
- column_335
- column_278
- column_367
- column_450
- column_108
- column_93
- column_248
- column_208
- column_396
- column_453
- column_345
- column_388
- column_420
- column_356
- column_51
- column_109
- column_348
- column_303
- column_479
- column_274
- column_72
- column_387
- column_481
- column_370
- column_346
- column_305
- column_141
- column_417
- column_339
- column_444
- column_426
- column_462
- column_96
- column_86
- column_408
- column_114
- column_466
- column_65
- column_379
- column_398
- column_365
- column_129
- column_292
- column_402
- column_460
- column_2
- column_74
- column_440
- column_236
- column_80
- column_251
- column_392
- column_427
- column_336
- column_107
- column_15
- column_149
- column_238
- column_442
- column_128
- list_field
- column_458
- column_441
- column_289
- column_446
- column_218
- column_140
- column_215
- column_105
- column_26
- column_203
- column_351
- column_337
- column_424
- column_320
- column_29
- column_341
- column_385
- column_389
- column_85
- column_151
- column_299
- column_321
- column_438
- column_269
- column_14
- column_58
- column_407
- column_347
- column_224
- column_38
- column_496
- column_300
- column_271
- column_489
- column_122
- column_252
- column_207
- column_229
- column_187
- column_113
- column_204
- column_455
- column_316
- column_314
- column_199
- column_284
- column_415
- column_310
- column_478
- column_234
- column_84
- column_342
- column_169
- column_276
- column_183
- int_field
- column_143
- column_449
- column_431
- column_175
- column_181
- column_104
- column_123
- column_343
- column_136
- column_226
- column_471
- column_480
- column_322
- column_197
- column_145
- column_172
- column_125
- column_272
- column_328
- column_64
- column_429
- column_78
- column_126
- column_42
- column_262
- column_484
- column_46
- column_150
- column_79
- column_352
- column_241
- column_477
- column_33
- column_87
- column_201
- column_249
- column_216
- column_213
- column_71
- column_405
- column_448
- column_433
- column_361
- column_153
- column_267
- column_250
- column_166
- column_68
- column_66
- column_95
- column_488
- column_186
- column_393
- column_282
- name
- column_266
- column_22
- column_41
- column_403
Basic Read¶
A straightforward read operation without any parameters returns all data in a pyarrow.Table
format (default).
[2]:
# Time a read with default arguments (all data, returned as a table).
t0 = time.time()
data = db.read()
print(data.shape)
elapsed = time.time() - t0
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the table reference before the next measurement
(100000, 507)
Total memory allocated: 572.2216796875 MB
Time taken to read all data: 0.3734889030456543 seconds
Column Selection¶
Read Specific Columns¶
To retrieve specific columns, you can pass a list of column names to the columns
parameter.
[3]:
# Time a read restricted to a single column.
t0 = time.time()
data = db.read(columns=["int_field"])
print(data.shape)
elapsed = time.time() - t0
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the table reference before the next measurement
(100000, 1)
Total memory allocated: 0.77490234375 MB
Time taken to read all data: 0.05100083351135254 seconds
Read Nested Columns¶
As you can see, column selection is much faster and less memory intensive, because only the specified columns are loaded into memory.
ParquetDB flattens nested structures by default. If you want to read fields from a nested structure, you can specify them using the syntax nested_value.field_name
.
[4]:
# Time a read of one flattened nested field, addressed as parent.child.
t0 = time.time()
data = db.read(columns=["nested_value.name"])
print(data.shape)
elapsed = time.time() - t0
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the table reference before the next measurement
(100000, 1)
Total memory allocated: 1.12738037109375 MB
Time taken to read all data: 0.046912431716918945 seconds
Do not read selected columns¶
If you want to read all columns but not include the selected columns you can set the include_cols
parameter to False
.
[5]:
# Time a read of every column except the listed one.
t0 = time.time()
data = db.read(columns=["int_field"], include_cols=False)
print(data.shape)
elapsed = time.time() - t0
# The excluded column must be absent from the result.
assert "int_field" not in data.column_names
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the table reference before the next measurement
(100000, 506)
Total memory allocated: 571.6526489257812 MB
Time taken to read all data: 0.35604023933410645 seconds
Filtering Data¶
You can filter data by using the filters
parameter. This parameter accepts a list of pyarrow expressions. ParquetDB will combine the filters using the and
operator.
For more information on how to create pyarrow expressions, please refer to the pyarrow documentation.
[6]:
import pyarrow.compute as pc
# Time a read that keeps only rows where int_field < 10.
t0 = time.time()
data = db.read(filters=[pc.field("int_field") < 10])
print(data.shape)
elapsed = time.time() - t0
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the table reference before the next measurement
(47684, 507)
Total memory allocated: 425.08599853515625 MB
Time taken to read all data: 0.4452495574951172 seconds
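Because only the matching rows are returned, memory usage is lower than for the full read (about 425 MB versus 572 MB above). The read itself is not necessarily faster, however: in this run it took about 0.45 seconds versus 0.37 seconds for the unfiltered read.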
You can still choose which columns to read while filtering.
[7]:
# Time a read that combines a row filter with a column selection.
t0 = time.time()
data = db.read(columns=["nested_value.name"], filters=[pc.field("int_field") < 10])
print(data.shape)
elapsed = time.time() - t0
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the table reference before the next measurement
(47684, 1)
Total memory allocated: 1.09210205078125 MB
Time taken to read all data: 0.03300309181213379 seconds
Rebuilding Nested Structures¶
ParquetDB by default flattens nested structures. If you want to read the nested data in its original form, you can set the rebuild_nested_struct
parameter to True
. This will create new files that contain the nested data.
Note: direct updates, creates, or deletes to the nested data will not be reflected in the original files. You will have to rebuild the nested structure from scratch by using the
rebuild_nested_from_scratch
parameter.
[10]:
# Time a read that returns nested data in its original (non-flattened) form.
t0 = time.time()
data = db.read(rebuild_nested_struct=True)
print(data.shape)
names = data.column_names
# The flattened child columns are replaced by the single parent column.
assert "nested_value.name" not in names
assert "nested_value.value" not in names
assert "nested_value" in names
elapsed = time.time() - t0
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the table reference before the next measurement
(100000, 506)
Total memory allocated: 528.3989868164062 MB
Time taken to read all data: 0.29286813735961914 seconds
Since the data is now in nested format, the way to read specific nested fields changes slightly. You can directly request the parent field, and it will return all of its child fields.
[13]:
# Read the whole nested parent column and convert the result to pandas.
t0 = time.time()
data = db.read(columns=["nested_value"], rebuild_nested_struct=True).to_pandas()
print(data.shape)
print(data.head())
# The parent column is returned under its own name.
assert "nested_value" in data.columns
elapsed = time.time() - t0
print(f"Time taken to read all data: {elapsed} seconds")
data = None  # drop the DataFrame reference
(100000, 1)
nested_value
0 {'name': 'item_13', 'value': 16}
1 {'name': 'item_83', 'value': 13}
2 {'name': 'item_27', 'value': 14}
3 {'name': 'item_20', 'value': 20}
4 {'name': 'item_87', 'value': 12}
Time taken to read all data: 0.06846165657043457 seconds
You can still specify the child field in the previous form
[14]:
# Read a single child field of the rebuilt nested column and convert to pandas.
# (The unused `column_names` assignment from the original cell was removed.)
start_time = time.time()
data = db.read(columns=["nested_value.name"], rebuild_nested_struct=True).to_pandas()
print(data.shape)
print(data.head())
end_time = time.time()
print(f"Time taken to read all data: {end_time - start_time} seconds")
data = None  # drop the DataFrame reference
(100000, 1)
name
0 item_13
1 item_83
2 item_27
3 item_20
4 item_87
Time taken to read all data: 0.03645682334899902 seconds
When the data is in nested format, creating pyarrow expressions for filtering is a bit different
[15]:
# Filter on a child of the rebuilt nested column: the field is addressed as
# pc.field(parent, child) rather than by the flattened "parent.child" name.
# (The unused `column_names` assignment from the original cell was removed.)
start_time = time.time()
data = db.read(
    columns=["nested_value.name"],
    filters=[pc.field("nested_value", "name") == "item_20"],
    rebuild_nested_struct=True,
).to_pandas()
print(data.shape)
print(data.head())
end_time = time.time()
print(f"Time taken to read all data: {end_time - start_time} seconds")
data = None  # drop the DataFrame reference
(1009, 1)
name
0 item_20
1 item_20
2 item_20
3 item_20
4 item_20
Time taken to read all data: 0.04416155815124512 seconds
Batch Processing¶
ParquetDB also supports batch processing. This is useful when you want to process data in smaller chunks to avoid memory issues. This can be done by setting the load_format="batches"
and setting the batch_size
parameter.
When this is used, the read
method will return a generator that yields pyarrow.RecordBatch
objects, which are similar to pyarrow.Table
objects.
[19]:
# Request the data as batches; this cell only creates the generator and
# does not iterate over it.
t0 = time.time()
generator = db.read(load_format="batches", batch_size=10000)
print(generator)
elapsed = time.time() - t0
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(f"Total memory allocated: {allocated_mb} MB")
print(f"Time taken to read all data: {elapsed} seconds")
<_cython_3_0_10.generator object at 0x000002BC5A011CC0>
Total memory allocated: 392.501708984375 MB
Time taken to read all data: 0.019051790237426758 seconds
Now, you can iterate over the generator to get the data in batches.
[20]:
# Consume the generator, printing the shape of each yielded batch.
for chunk in generator:
    print(chunk.shape)
(10000, 507)
(10000, 507)
(10000, 507)
(2768, 507)
(10000, 507)
(10000, 507)
(10000, 507)
(2768, 507)
(10000, 507)
(10000, 507)
(10000, 507)
(2768, 507)
(1696, 507)
(10000, 507)
(10000, 507)
(10000, 507)
(2768, 507)
(10000, 507)
(10000, 507)
(10000, 507)
(2768, 507)
(10000, 507)
(10000, 507)
(10000, 507)
(2768, 507)
(1696, 507)
More configurations can be used to control the batching process by adding the load_config
parameter which is an instance of LoadConfig
.
@dataclass
class LoadConfig:
    """
    Configuration for loading data: batch size, read-ahead, threading, and memory usage.

    Parameters
    ----------
    batch_size : int, optional
        The number of rows to process in each batch (default: 131_072).
    batch_readahead : int, optional
        The number of batches to read ahead in a file (default: 16).
    fragment_readahead : int, optional
        The number of files to read ahead, improving IO utilization at the cost of RAM usage (default: 4).
    fragment_scan_options : Optional[pa.dataset.FragmentScanOptions], optional
        Options specific to a particular scan and fragment type, potentially changing across scans.
    use_threads : bool, optional
        Whether to use maximum parallelism determined by available CPU cores (default: True).
    memory_pool : Optional[pa.MemoryPool], optional
        The memory pool for allocations. Defaults to the system's default memory pool.
    """

    batch_size: int = 131_072
    batch_readahead: int = 16
    fragment_readahead: int = 4
    fragment_scan_options: Optional[pa.dataset.FragmentScanOptions] = None
    use_threads: bool = True
    memory_pool: Optional[pa.MemoryPool] = None
[27]:
# Time a full pass over the batches for two batch_readahead settings.
# The printed label is derived from the value actually used, so the message
# cannot drift from the configuration (the original labelled the 16 run as 4).
for batch_readahead in (1, 16):
    load_config = LoadConfig(batch_size=10000, batch_readahead=batch_readahead)
    generator = db.read(load_format="batches", load_config=load_config)
    start_time = time.time()
    for record_batch in generator:
        # Iterate to exhaustion; the shape access just touches each batch.
        batch = record_batch.shape
    end_time = time.time()
    print(
        f"Time taken to read all data when batch_readahead is {batch_readahead}: "
        f"{end_time - start_time} seconds"
    )
Time taken to read all data when batch_readahead is 1: 0.6078670024871826 seconds
Time taken to read all data when batch_readahead is 16: 0.5378696918487549 seconds
This difference is small because this dataset is small, but it can be more noticeable for larger datasets.