3D Alexandria Database

alexandria_3d

[ ]:
!pip install parquetdb
!pip install pymatgen
[1]:
import json
import os
import logging
from glob import glob
import shutil
import time

import numpy as np
from pyarrow import compute as pc

from parquetdb.core.parquetdb import LoadConfig, NormalizeConfig
from parquetdb.utils.general_utils import timeit
from parquetdb import ParquetDB, config
from parquetdb.utils.external_utils import download_alexandria_3d_database

import matplotlib.pyplot as plt

Setup

Setup data directories

[2]:
base_dir = os.path.join(config.data_dir, "external", "alexandria", "AlexandriaDB")
benchmark_dir = os.path.join(config.data_dir, "benchmarks", "alexandria")

Download the database

Let's download the database

[3]:
def download_alexandria_database(base_dir, from_scratch=False):
    print("Starting task: download_alexandria_database")
    if from_scratch and os.path.exists(base_dir):
        print(f"Removing existing directory: {base_dir}")
        shutil.rmtree(base_dir, ignore_errors=True)

    # Here we download the database and save it to the data directory
    output_dir = os.path.join(config.data_dir, "external", "alexandria")
    alexandria_dir = download_alexandria_3d_database(output_dir, n_cores=8)
    print("Done with task: download_alexandria_database")
    print("-" * 200)
    return alexandria_dir


alexandria_dir = download_alexandria_database(base_dir, from_scratch=False)
print(alexandria_dir)
Starting task: download_alexandria_database
Database downloaded already. Skipping download.
Done with task: download_alexandria_database
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Z:\data\parquetdb\data\external\alexandria\uncompressed

Creating the database

[4]:
db = ParquetDB(db_path=os.path.join(base_dir, "alexandria_3D"))
print(db)
[INFO] 2025-04-22 09:14:49 - parquetdb.core.parquetdb[205][__init__] - Initializing ParquetDB with db_path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D
[INFO] 2025-04-22 09:14:49 - parquetdb.core.parquetdb[207][__init__] - verbose: 1
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D

• Number of columns: 128
• Number of rows: 4389295
• Number of files: 1
• Number of rows per file: [4389295]
• Number of row groups per file: [22]
• Serialized metadata size per file: [377883] Bytes

############################################################
METADATA
############################################################

############################################################
COLUMN DETAILS
############################################################

Let's define some dictionaries to store benchmark results

[5]:
benchmark_dict = {
    "create_times": [],
    "json_load_times": [],
    "n_rows_per_file": [],
}

task_benchmark_dict = {"task_names": [], "task_times": []}

Inputting the data

Here I am just iterating over the JSON files and creating the database. I am also storing how long it takes to load each JSON file, how many materials each file contains, and how long it takes to insert the data into the database.

This dataset is rather large, so you may have to choose the normalization parameters based on how much RAM you have. If you do not have enough, tweak the batch_size, batch_readahead, and fragment_readahead parameters.
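
Below is a sketch of what a lower-memory configuration might look like; the values are illustrative assumptions and should be adjusted to your machine rather than taken as a recommendation.

[ ]:
# Hypothetical low-RAM configuration: smaller batches and less readahead
# reduce peak memory usage at the cost of a slower normalization pass.
low_ram_normalize_config = NormalizeConfig(
    load_format="batches",  # Stream the data in batches instead of loading it all at once
    batch_readahead=1,  # Prefetch only one batch at a time
    fragment_readahead=1,  # Prefetch only one file at a time
    batch_size=10000,  # Fewer rows per batch lowers peak RAM usage
    max_rows_per_file=100000,
    max_rows_per_group=100000,
)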

[6]:
def create_database_if_empty(db, alexandria_dir, normalize_config=NormalizeConfig()):
    """Create the dataset from the Alexandria JSON files if the database is empty.

    Parameters
    ----------
    db : ParquetDB
        The database instance to create the dataset on.
    alexandria_dir : str
        The directory containing the JSON files to load into the database.
    normalize_config : NormalizeConfig
        Normalization settings passed to db.create.

    Returns
    -------
    tuple of (list, list, list)
        The JSON load times, create times, and number of materials per file.
    """
    print("Starting task: create_database_if_empty")

    start_time = time.time()

    json_load_times = []
    create_times = []
    n_materials_per_file = []
    if db.is_empty():
        print("The dataset does not exist. Creating it.")
        json_files = glob(os.path.join(alexandria_dir, "*.json"))
        for json_file in json_files[:]:

            start_time = time.time()
            with open(json_file, "r") as f:
                data = json.load(f)
            json_load_time = time.time() - start_time

            base_name = os.path.basename(json_file)
            n_materials = len(data["entries"])
            print(f"Processing file: {base_name}")
            print(f"Number of materials: {n_materials}")
            try:
                # Since we are importing a lot of data, it is best
                # to normalize the database afterwards
                start_time = time.time()
                db.create(
                    data["entries"],
                    normalize_dataset=False,
                    normalize_config=normalize_config,
                )
                create_time = time.time() - start_time

                create_times.append(create_time)
                n_materials_per_file.append(n_materials)
                json_load_times.append(json_load_time)

            except Exception as e:
                print(e)

            data = None

            print(f"Time taken to create dataset: {time.time() - start_time}")
            print("-" * 100)

    print("Done with task: create_database_if_empty")
    print("-" * 200)
    return json_load_times, create_times, n_materials_per_file


normalize_config = NormalizeConfig(
    load_format="batches",  # Uses the batch generator to normalize
    batch_readahead=4,  # Number of batches to load in memory ahead of time. This impacts the amount of RAM consumed
    fragment_readahead=2,  # Number of files to load in memory ahead of time. This impacts the amount of RAM consumed
    batch_size=100000,  # Batch size to use when normalizing. This impacts the amount of RAM consumed
    max_rows_per_file=100000,  # Max number of rows per parquet file
    max_rows_per_group=100000,  # Max number of rows per parquet row group
)


json_load_times, create_times, n_materials_per_file = create_database_if_empty(
    db, alexandria_dir, normalize_config=normalize_config
)

benchmark_dict["create_times"] = create_times
benchmark_dict["json_load_times"] = json_load_times
benchmark_dict["n_rows_per_file"] = n_materials_per_file
Starting task: create_database_if_empty
Done with task: create_database_if_empty
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Normalizing the dataset

It is best practice to normalize the dataset after all the data has been loaded. This will optimize the performance of the database.

First let’s see how the data is distributed in the row groups.

[7]:
summary = db.summary(show_row_group_metadata=True)
print(summary)
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D

• Number of columns: 128
• Number of rows: 4389295
• Number of files: 1
• Number of rows per file: [4389295]
• Number of row groups per file: [22]
• Number of rows per row group per file:
    - alexandria_3D_0.parquet:
        - Row group 0: 200000 rows
        - Row group 1: 200000 rows
        - Row group 2: 200000 rows
        - Row group 3: 200000 rows
        - Row group 4: 200000 rows
        - Row group 5: 200000 rows
        - Row group 6: 200000 rows
        - Row group 7: 200000 rows
        - Row group 8: 200000 rows
        - Row group 9: 200000 rows
        - Row group 10: 200000 rows
        - Row group 11: 200000 rows
        - Row group 12: 200000 rows
        - Row group 13: 200000 rows
        - Row group 14: 200000 rows
        - Row group 15: 200000 rows
        - Row group 16: 200000 rows
        - Row group 17: 200000 rows
        - Row group 18: 200000 rows
        - Row group 19: 200000 rows
        - Row group 20: 200000 rows
        - Row group 21: 189295 rows
• Serialized metadata size per file: [377883] Bytes

############################################################
METADATA
############################################################

############################################################
COLUMN DETAILS
############################################################

Let’s check the file size of the parquet files.

[8]:
def list_file_sizes(directory, in_MB=True):
    """Lists the size of files in a directory.

    Args:
      directory: The path to the directory.
      in_MB: If True, report file sizes in megabytes instead of bytes.

    Returns:
      A dict mapping filename to file size.
    """
    file_sizes = {}
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            file_size = os.path.getsize(file_path)
            if in_MB:
                file_size = file_size / (1024 * 1024)
            file_sizes[filename] = file_size
    return file_sizes


file_sizes = list_file_sizes(db.db_path)

for file, size in file_sizes.items():
    print(f"{file}: {size} MB")
alexandria_3D_0.parquet: 2767.1490955352783 MB

Let’s also check the size of the row groups.

[9]:
row_group_metadata_per_file = db.get_parquet_file_row_group_metadata_per_file(
    as_dict=True
)
row_group_size_per_file = {}
sum_row_group_size = 0
num_row_groups = 0
for file, row_group_metadata in row_group_metadata_per_file.items():
    print(f"{file}")
    row_group_size_per_file[file] = {}
    for row_group, metadata in row_group_metadata.items():
        row_group_size_per_file[file][row_group] = metadata["total_byte_size"] / (
            1024 * 1024
        )
        sum_row_group_size += row_group_size_per_file[file][row_group]
        num_row_groups += 1
        print(f"     {row_group}: {row_group_size_per_file[file][row_group]} MB")

print(f"Average row group size: {sum_row_group_size/num_row_groups} MB")
alexandria_3D_0.parquet
     0: 201.08054542541504 MB
     1: 220.59319686889648 MB
     2: 197.38053512573242 MB
     3: 187.26845264434814 MB
     4: 227.97186374664307 MB
     5: 192.6089096069336 MB
     6: 216.52346420288086 MB
     7: 198.00727558135986 MB
     8: 227.92319583892822 MB
     9: 226.87678337097168 MB
     10: 244.66320514678955 MB
     11: 215.52607440948486 MB
     12: 198.7313060760498 MB
     13: 204.19597339630127 MB
     14: 192.36825561523438 MB
     15: 217.40732860565186 MB
     16: 215.3526430130005 MB
     17: 201.5632438659668 MB
     18: 189.40394115447998 MB
     19: 200.09145641326904 MB
     20: 208.74722480773926 MB
     21: 206.02736282348633 MB
Average row group size: 208.65055626088923 MB

For optimal performance we should aim for about 2 GB per file and about 200-500 MB per row group.

For a row group with 32,768 rows, the size is ~30 MB. To get to ~200 MB we need roughly 200 MB / 30 MB ≈ 7 times as many rows per group, or ~200,000 rows.

If each row group is about 200 MB at 200,000 rows, then we should have 2 GB / 200 MB = 10 row groups per file, or 2,000,000 rows per file.
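
As a quick sanity check, the same arithmetic can be done from the row-group metadata gathered above. This is only a sketch: it reuses the sum_row_group_size and num_row_groups values computed in the previous cell, and the 200 MB / 2 GB targets are the rules of thumb quoted above.

[ ]:
# Sketch: derive target rows per row group and per file from the observed sizes.
TARGET_ROW_GROUP_MB = 200
TARGET_FILE_MB = 2000

rows_per_group_now = 200000  # current rows per row group, from the summary above
mb_per_row = (sum_row_group_size / num_row_groups) / rows_per_group_now
target_rows_per_group = int(TARGET_ROW_GROUP_MB / mb_per_row)
target_rows_per_file = int(TARGET_FILE_MB / mb_per_row)
print(f"~{target_rows_per_group:,} rows per row group, ~{target_rows_per_file:,} rows per file")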

[10]:
def normalize_dataset(db, normalize_config=NormalizeConfig()):
    task_name = "normalize_dataset"
    print("Starting task: normalize_dataset")

    db.normalize(normalize_config=normalize_config)
    print("Done with task: normalize_dataset")
    print("-" * 200)
    return task_name


normalize_config = NormalizeConfig(
    load_format="batches",  # Uses the batch generator to normalize
    batch_readahead=4,  # Number of batches to load in memory ahead of time. This impacts the amount of RAM consumed
    fragment_readahead=1,  # Number of files to load in memory ahead of time. This impacts the amount of RAM consumed
    batch_size=10000,  # Batch size to use when normalizing. This impacts the amount of RAM consumed
    max_rows_per_file=2000000,  # Max number of rows per parquet file
    max_rows_per_group=200000,  # Max number of rows per parquet row group
    min_rows_per_group=200000,  # Min number of rows per parquet row group
)
normalize_dataset(db, normalize_config=normalize_config)
print(db.summary(show_row_group_metadata=True))
Starting task: normalize_dataset
Done with task: normalize_dataset
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D

• Number of columns: 128
• Number of rows: 4389295
• Number of files: 3
• Number of rows per file: [2000000, 2000000, 389295]
• Number of row groups per file: [10, 10, 2]
• Number of rows per row group per file:
    - alexandria_3D_0.parquet:
        - Row group 0: 200000 rows
        - Row group 1: 200000 rows
        - Row group 2: 200000 rows
        - Row group 3: 200000 rows
        - Row group 4: 200000 rows
        - Row group 5: 200000 rows
        - Row group 6: 200000 rows
        - Row group 7: 200000 rows
        - Row group 8: 200000 rows
        - Row group 9: 200000 rows
    - alexandria_3D_1.parquet:
        - Row group 0: 200000 rows
        - Row group 1: 200000 rows
        - Row group 2: 200000 rows
        - Row group 3: 200000 rows
        - Row group 4: 200000 rows
        - Row group 5: 200000 rows
        - Row group 6: 200000 rows
        - Row group 7: 200000 rows
        - Row group 8: 200000 rows
        - Row group 9: 200000 rows
    - alexandria_3D_2.parquet:
        - Row group 0: 200000 rows
        - Row group 1: 189295 rows
• Serialized metadata size per file: [180572, 178843, 48142] Bytes

############################################################
METADATA
############################################################

############################################################
COLUMN DETAILS
############################################################

Let’s check the file sizes and row group sizes again.

[11]:
file_sizes = list_file_sizes(db.db_path)

for file, size in file_sizes.items():
    print(f"{file}: {size} MB")


row_group_metadata_per_file = db.get_parquet_file_row_group_metadata_per_file(
    as_dict=True
)
row_group_size_per_file = {}
sum_row_group_size = 0
num_row_groups = 0
for file, row_group_metadata in row_group_metadata_per_file.items():
    print(f"{file}")
    row_group_size_per_file[file] = {}
    for row_group, metadata in row_group_metadata.items():
        row_group_size_per_file[file][row_group] = metadata["total_byte_size"] / (
            1024 * 1024
        )
        sum_row_group_size += row_group_size_per_file[file][row_group]
        num_row_groups += 1
        print(f"     {row_group}: {row_group_size_per_file[file][row_group]} MB")

print(f"Average row group size: {sum_row_group_size/num_row_groups} MB")
alexandria_3D_0.parquet: 1261.4758415222168 MB
alexandria_3D_1.parquet: 1251.6552095413208 MB
alexandria_3D_2.parquet: 254.18556213378906 MB
alexandria_3D_0.parquet
     0: 201.72274780273438 MB
     1: 219.45661544799805 MB
     2: 197.442889213562 MB
     3: 187.3084535598755 MB
     4: 227.44355964660645 MB
     5: 192.6088399887085 MB
     6: 216.52346420288086 MB
     7: 198.00727558135986 MB
     8: 227.92319583892822 MB
     9: 226.87678337097168 MB
alexandria_3D_1.parquet
     0: 244.66320514678955 MB
     1: 215.52607440948486 MB
     2: 198.7313995361328 MB
     3: 203.92721366882324 MB
     4: 191.9358901977539 MB
     5: 218.27143096923828 MB
     6: 215.63258838653564 MB
     7: 201.5486354827881 MB
     8: 191.0305995941162 MB
     9: 199.67366981506348 MB
alexandria_3D_2.parquet
     0: 208.74722480773926 MB
     1: 206.0274362564087 MB
Average row group size: 208.6831451329318 MB

Here we see that the row groups are about 200 MB and the files are a little over 1 GB, somewhat below the 2 GB target. Since fewer files means more of the common metadata is stored together, in this case I would try to store all the data in a single file.

[12]:
normalize_config = NormalizeConfig(
    load_format="batches",  # Uses the batch generator to normalize
    batch_readahead=4,  # Number of batches to load in memory ahead of time. This impacts the amount of RAM consumed
    fragment_readahead=1,  # Number of files to load in memory ahead of time. This impacts the amount of RAM consumed
    batch_size=10000,  # Batch size to use when normalizing. This impacts the amount of RAM consumed
    max_rows_per_file=5000000,  # Max number of rows per parquet file
    max_rows_per_group=200000,  # Max number of rows per parquet row group
    min_rows_per_group=200000,  # Min number of rows per parquet row group
)
normalize_dataset(db, normalize_config=normalize_config)
print(db.summary(show_row_group_metadata=True))

file_sizes = list_file_sizes(db.db_path)
for file, size in file_sizes.items():
    print(f"{file}: {size} MB")
Starting task: normalize_dataset
Done with task: normalize_dataset
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
============================================================
PARQUETDB SUMMARY
============================================================
Database path: Z:\data\parquetdb\data\external\alexandria\AlexandriaDB\alexandria_3D

• Number of columns: 128
• Number of rows: 4389295
• Number of files: 1
• Number of rows per file: [4389295]
• Number of row groups per file: [22]
• Number of rows per row group per file:
    - alexandria_3D_0.parquet:
        - Row group 0: 200000 rows
        - Row group 1: 200000 rows
        - Row group 2: 200000 rows
        - Row group 3: 200000 rows
        - Row group 4: 200000 rows
        - Row group 5: 200000 rows
        - Row group 6: 200000 rows
        - Row group 7: 200000 rows
        - Row group 8: 200000 rows
        - Row group 9: 200000 rows
        - Row group 10: 200000 rows
        - Row group 11: 200000 rows
        - Row group 12: 200000 rows
        - Row group 13: 200000 rows
        - Row group 14: 200000 rows
        - Row group 15: 200000 rows
        - Row group 16: 200000 rows
        - Row group 17: 200000 rows
        - Row group 18: 200000 rows
        - Row group 19: 200000 rows
        - Row group 20: 200000 rows
        - Row group 21: 189295 rows
• Serialized metadata size per file: [377929] Bytes

############################################################
METADATA
############################################################

############################################################
COLUMN DETAILS
############################################################

alexandria_3D_0.parquet: 2767.2842226028442 MB

Basic Operations

In this section we are going to test the performance of ParquetDB for basic operations.

Reading a single column

If we wanted a single property for all materials in Alexandria and the data were stored as JSON, we would have to iterate over and parse every JSON file just to collect that one property.
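
For comparison, the JSON baseline would look roughly like the sketch below, assuming the Alexandria layout used earlier in this notebook, where each file stores its records under an "entries" key.

[ ]:
# Sketch of the JSON baseline: every file must be opened and fully parsed
# even though we only keep a single value per entry.
def collect_property_from_json(alexandria_dir, key="energy"):
    values = []
    for json_file in glob(os.path.join(alexandria_dir, "*.json")):
        with open(json_file, "r") as f:
            data = json.load(f)
        values.extend(entry[key] for entry in data["entries"])
    return values

# energies = collect_property_from_json(alexandria_dir)  # slow: parses every JSON file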

[13]:
total_time_to_read_from_json = sum(benchmark_dict["json_load_times"])


def read_single_column(db):
    task_name = "read_single_column"
    print("Starting task: read_single_column")
    table = db.read(columns=["id"], load_format="table")
    print(table.shape)
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_single_column(db)
read_single_column_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_single_column_time)

print(f"Time to read from parquetdb: {read_single_column_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_single_column
(4389295, 1)
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.04 seconds
Time to read from json: 0.00 seconds

Reading specific ids

Now, if we wanted specific records, the minimum time to do this with the JSON files would be the time it takes to iterate over all of them.

In ParquetDB, it is much simpler and less memory-intensive to read specific ids.

[14]:
def read_specific_ids(db):
    task_name = "read_specific_ids"
    print("Starting task: read_specific_ids")
    table = db.read(
        ids=[
            0,
            10,
            100,
            1000,
            10000,
            100000,
            1000000,
        ],  # Controls which rows we want to read
        load_format="table",  # Controls the output format. The options are 'table', 'batches', `dataset`.
    )

    df = table.to_pandas()  # Converts the table to a pandas dataframe
    print(df["id"])
    print(df.head())
    print(df.shape)

    print(f"Data : {df.iloc[0]['data.spg']}")
    print(list(df.columns))
    print("Done with task: read_specific_ids")
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_specific_ids(db)
read_specific_ids_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_specific_ids_time)

print(f"Time to read from parquetdb: {read_specific_ids_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_specific_ids
0      10000
1          0
2         10
3        100
4       1000
5     100000
6    1000000
Name: id, dtype: int64
                   @class                            @module  composition.Ac  \
0  ComputedStructureEntry  pymatgen.entries.computed_entries             2.0
1  ComputedStructureEntry  pymatgen.entries.computed_entries             1.0
2  ComputedStructureEntry  pymatgen.entries.computed_entries             1.0
3  ComputedStructureEntry  pymatgen.entries.computed_entries             1.0
4  ComputedStructureEntry  pymatgen.entries.computed_entries             2.0

   composition.Ag  composition.Al  composition.Ar  composition.As  \
0             NaN             NaN             NaN             NaN
1             NaN             NaN             NaN             NaN
2             NaN             NaN             NaN             NaN
3             NaN             NaN             NaN             NaN
4             NaN             NaN             NaN             NaN

   composition.Au  composition.B  composition.Ba  ...  structure.lattice.a  \
0             NaN            NaN             NaN  ...             4.829086
1             NaN            NaN             NaN  ...            10.091510
2             NaN            NaN             NaN  ...            15.519920
3             NaN            NaN             NaN  ...             5.789011
4             NaN            NaN             NaN  ...             8.695520

   structure.lattice.alpha  structure.lattice.b  structure.lattice.beta  \
0                90.000000             4.829086               90.000000
1               109.471217            10.091510              109.471222
2                89.997963            11.000739               89.996986
3               111.411199             8.109282               99.469696
4                69.601810             7.709933               55.506047

   structure.lattice.c  structure.lattice.gamma  \
0             9.745281                90.000000
1            10.091511               109.471219
2             5.192681                45.191179
3             8.109282                99.469696
4             7.709933                55.506037

                            structure.lattice.matrix  structure.lattice.pbc  \
0  [4.82908586, 0.0, 0.0, 0.0, 4.82908586, 0.0, 0...     [True, True, True]
1  [9.51436671, 2e-08, -3.36383678, -4.75718337, ...     [True, True, True]
2  [15.51989271, -0.00721914, 0.02799947, 7.75631...     [True, True, True]
3  [5.78373142, -0.13705652, -0.20571227, -1.1463...     [True, True, True]
4  [7.92465233, 0.98010515, 3.44257929, 3.2101699...     [True, True, True]

   structure.lattice.volume                                    structure.sites
0                227.260645  [{'abc': [0.0, 0.5, 0.2399718], 'label': 'Ac',...
1                791.127799  [{'abc': [0.0, 0.0, 0.0], 'label': 'Ac', 'prop...
2                628.973408  [{'abc': [3e-06, 3e-06, 0.0], 'label': 'Ac', '...
3                338.968680  [{'abc': [0.5, 0.0, 0.0], 'label': 'Ac', 'prop...
4                350.818535  [{'abc': [0.83221258, 0.94982224, 0.94982224],...

[5 rows x 128 columns]
(7, 128)
[dtype('O'), dtype('int64'), dtype('float64')]
Data : 129
['@class', '@module', 'composition.Ac', 'composition.Ag', 'composition.Al', 'composition.Ar', 'composition.As', 'composition.Au', 'composition.B', 'composition.Ba', 'composition.Be', 'composition.Bi', 'composition.Br', 'composition.C', 'composition.Ca', 'composition.Cd', 'composition.Ce', 'composition.Cl', 'composition.Co', 'composition.Cr', 'composition.Cs', 'composition.Cu', 'composition.Dy', 'composition.Er', 'composition.Eu', 'composition.F', 'composition.Fe', 'composition.Ga', 'composition.Gd', 'composition.Ge', 'composition.H', 'composition.He', 'composition.Hf', 'composition.Hg', 'composition.Ho', 'composition.I', 'composition.In', 'composition.Ir', 'composition.K', 'composition.Kr', 'composition.La', 'composition.Li', 'composition.Lu', 'composition.Mg', 'composition.Mn', 'composition.Mo', 'composition.N', 'composition.Na', 'composition.Nb', 'composition.Nd', 'composition.Ne', 'composition.Ni', 'composition.Np', 'composition.O', 'composition.Os', 'composition.P', 'composition.Pa', 'composition.Pb', 'composition.Pd', 'composition.Pm', 'composition.Pr', 'composition.Pt', 'composition.Pu', 'composition.Rb', 'composition.Re', 'composition.Rh', 'composition.Ru', 'composition.S', 'composition.Sb', 'composition.Sc', 'composition.Se', 'composition.Si', 'composition.Sm', 'composition.Sn', 'composition.Sr', 'composition.Ta', 'composition.Tb', 'composition.Tc', 'composition.Te', 'composition.Th', 'composition.Ti', 'composition.Tl', 'composition.Tm', 'composition.U', 'composition.V', 'composition.W', 'composition.Xe', 'composition.Y', 'composition.Yb', 'composition.Zn', 'composition.Zr', 'correction', 'data.band_gap_dir', 'data.band_gap_ind', 'data.decomposition', 'data.dos_ef', 'data.e_above_hull', 'data.e_form', 'data.e_phase_separation', 'data.elements', 'data.energy_corrected', 'data.energy_total', 'data.formula', 'data.location', 'data.mat_id', 'data.nsites', 'data.prototype_id', 'data.spg', 'data.stress', 'data.total_mag', 'energy', 'energy_adjustments', 'entry_id', 'id', 'parameters.dummy_field', 'structure.@class', 'structure.@module', 'structure.charge', 'structure.lattice.a', 'structure.lattice.alpha', 'structure.lattice.b', 'structure.lattice.beta', 'structure.lattice.c', 'structure.lattice.gamma', 'structure.lattice.matrix', 'structure.lattice.pbc', 'structure.lattice.volume', 'structure.sites']
Done with task: read_specific_ids
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 4.16 seconds
Time to read from json: 0.00 seconds

Finding the minimum and maximum of the energy

[15]:
def read_energy_min_max(db):
    task_name = "read_energy_min_max"
    print("Starting task: read_energy_min_max")
    table = db.read(columns=["energy"], load_format="table")
    print(table.shape)

    result = pc.min_max(table["energy"])
    # The result will be a struct with 'min' and 'max' fields
    min_value = result["min"].as_py()
    max_value = result["max"].as_py()

    print(f"Min: {min_value}, Max: {max_value}")
    print("Done with task: read_energy_min_max")
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_energy_min_max(db)
read_energy_min_max_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_energy_min_max_time)

print(f"Time to read from parquetdb: {read_energy_min_max_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_energy_min_max
(4389295, 1)
Min: -1496.5922219, Max: -0.003981
Done with task: read_energy_min_max
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.35 seconds
Time to read from json: 0.00 seconds

Reading records filtered by energy above -1.0

[16]:
def read_filtered_energy_above_minus_one(db):
    task_name = "read_filtered_energy_above_-1"

    """Read records filtered by energy above -1.0 and track timing."""
    print("Starting task: read_filtered_energy_above_minus_one")

    table = db.read(
        columns=["id", "energy"],
        filters=[pc.field("energy") > -1.0],
        load_format="table",
    )

    df = table.to_pandas()  # Converts the table to a pandas dataframe
    print(df.head())
    print(df.shape)
    print("Done with task: read_filtered_energy_above_minus_one")
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_filtered_energy_above_minus_one(db)
read_filtered_energy_above_minus_one_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_filtered_energy_above_minus_one_time)

print(
    f"Time to read from parquetdb: {read_filtered_energy_above_minus_one_time:.2f} seconds"
)
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_filtered_energy_above_minus_one
       id    energy
0  123136 -0.063105
1  123137 -0.125970
2  403318 -0.972671
3  570682 -0.907343
4  570683 -0.901483
(46, 2)
Done with task: read_filtered_energy_above_minus_one
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.04 seconds
Time to read from json: 0.00 seconds

Reading records filtered by spg 204

[17]:
def read_filtered_spg_204(db):
    task_name = "read_filtered_spg_204_table"

    print("Starting task: read_filtered_spg_204")
    table = db.read(
        columns=["id", "data.spg"],
        filters=[pc.field("data.spg") == 204],
        load_format="table",
    )

    df = table.to_pandas()  # Converts the table to a pandas dataframe
    print(df.head())
    print(df.shape)

    print("Done with task: read_filtered_spg_204")
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_filtered_spg_204(db)
read_filtered_spg_204_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_filtered_spg_204_time)

print(f"Time to read from parquetdb: {read_filtered_spg_204_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_filtered_spg_204
      id  data.spg
0  10113       204
1  10125       204
2  10126       204
3  10133       204
4  10140       204
(7240, 2)
Done with task: read_filtered_spg_204
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.05 seconds
Time to read from json: 0.00 seconds

Reading records filtered by spg batches

[18]:
def read_filtered_spg_batches(db):
    task_name = "read_filtered_spg_batches"
    print("Starting task: read_filtered_spg_batches")
    generator = db.read(
        load_format="batches",
        batch_size=1000,
        load_config=LoadConfig(
            batch_readahead=10,
            fragment_readahead=2,
            fragment_scan_options=None,
            use_threads=True,
            memory_pool=None,
        ),
        columns=["id", "data.spg"],
        filters=[pc.field("data.spg") == 204],
    )

    batch_count = 0
    num_rows = 0
    for table in generator:
        df = table.to_pandas()
        num_rows += table.num_rows
        batch_count += 1
    print(f"Total number of rows: {num_rows}, Batches: {batch_count}")
    print("Done with task: read_filtered_spg_batches")
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_filtered_spg_batches(db)
read_filtered_spg_batches_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_filtered_spg_batches_time)

print(f"Time to read from parquetdb: {read_filtered_spg_batches_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_filtered_spg_batches
Total number of rows: 7240, Batches: 4390
Done with task: read_filtered_spg_batches
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.76 seconds
Time to read from json: 0.00 seconds

Reading lattice matrix for space group 204

[19]:
def read_lattice_matrix_spg_204(db):
    task_name = "read_lattice_matrix_spg_204"

    print("Starting task: read_lattice_matrix_spg_204")

    table = db.read(
        columns=["structure.lattice.matrix"], filters=[pc.field("data.spg") == 204]
    )
    lattice = table["structure.lattice.matrix"].combine_chunks().to_numpy_ndarray()
    print(lattice.shape)
    print("Done with task: read_lattice_matrix_spg_204")
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_lattice_matrix_spg_204(db)
read_lattice_matrix_spg_204_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_lattice_matrix_spg_204_time)

print(f"Time to read from parquetdb: {read_lattice_matrix_spg_204_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_lattice_matrix_spg_204
(7240, 3, 3)
Done with task: read_lattice_matrix_spg_204
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 0.63 seconds
Time to read from json: 0.00 seconds

Reading nested column selection

[20]:
def read_nested_column_selection(db):
    task_name = "read_nested_column_selection"

    print("Starting task: read_nested_column_selection")
    table = db.read(columns=["id", "structure.sites"], load_format="table")

    print(table.shape)
    print(table["structure.sites"].type)
    print(table["structure.sites"].combine_chunks().type)
    print("Done with task: read_nested_column_selection")
    print("-" * 200)
    return task_name


start_time = time.time()
task_name = read_nested_column_selection(db)
read_nested_column_selection_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(read_nested_column_selection_time)

print(f"Time to read from parquetdb: {read_nested_column_selection_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_nested_column_selection
(4389295, 2)
list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>
list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>
Done with task: read_nested_column_selection
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 3.53 seconds
Time to read from json: 0.00 seconds

Nested structure into class

[21]:
def read_nested_structure_into_class(db):
    # By default the database flattens nested structure for storage.
    # However, we provide an option to rebuild the nested structure. This will create a new dataset in {dataset_name}_nested.
    # After the creation of the new dataset, the query parameters are applied to the new dataset.
    task_name = "read_nested_structure_into_class"

    print("Starting task: read_nested_structure_into_class")
    table = db.read(
        columns=[
            "id",
            "structure",
            "data",
        ],  # Instead of using the flatten syntax, we can use the nested syntax
        ids=[0, 1000000],
        load_format="table",
        rebuild_nested_struct=True,  # When set to True, the nested structure is rebuilt
        rebuild_nested_from_scratch=False,  # When set to True, the nested structure is rebuilt from scratch
        normalize_config=NormalizeConfig(
            load_format="batches",
            batch_readahead=2,
            fragment_readahead=1,
            batch_size=10000,
            max_rows_per_file=5000000,
            min_rows_per_group=200000,
            max_rows_per_group=200000,
        ),
    )

    print(table.shape)
    print(table["data"].type)

    print("structure type")
    print(table["structure"].type)
    try:
        from pymatgen.core.structure import Structure

        structure = Structure.from_dict(
            table["structure"].combine_chunks().to_pylist()[0]
        )

        print(structure)
    except Exception as e:
        print(e)
    print("Done with task: read_nested_structure_into_class")
    print("-" * 200)
    return task_name


start_time = time.time()
read_nested_structure_into_class(db)
nested_structure_time = time.time() - start_time

print(f"Time to read from parquetdb: {nested_structure_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_nested_structure_into_class
(2, 3)
struct<band_gap_dir: double, band_gap_ind: double, decomposition: string, dos_ef: double, e_above_hull: double, e_form: double, e_phase_separation: double, elements: list<element: string>, energy_corrected: double, energy_total: double, formula: string, location: string, mat_id: string, nsites: int64, prototype_id: string, spg: int64, stress: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, total_mag: double>
structure type
struct<@class: string, @module: string, charge: int64, lattice: struct<a: double, alpha: double, b: double, beta: double, c: double, gamma: double, matrix: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, pbc: extension<arrow.fixed_shape_tensor[value_type=bool, shape=[3]]>, volume: double>, sites: list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>>
Full Formula (Ac1 Pr12 Ho7)
Reduced Formula: AcPr12Ho7
abc   :  10.091510  10.091510  10.091511
angles: 109.471217 109.471222 109.471219
pbc   :       True       True       True
Sites (20)
  #  SP           a         b         c    charge  forces                                     magmom
---  ----  --------  --------  --------  --------  ---------------------------------------  --------
  0  Ac    0         0         0            8.076  [0.0, -0.0, -0.0]                              -0
  1  Pr    0.476206  0.707375  0.768831     8.873  [0.0022184, 0.00231648, -0.00126846]           -0
  2  Pr    0.938545  0.707375  0.231169     8.873  [0.00045645, 0.00231648, -0.00251434]          -0
  3  Pr    0.523794  0.292625  0.231169     8.873  [-0.0022184, -0.00231648, 0.00126846]          -0
  4  Pr    0.061455  0.292625  0.768831     8.873  [-0.00045645, -0.00231648, 0.00251434]         -0
  5  Pr    0.768831  0.061455  0.292625     8.873  [0.00177791, -0.00155354, -0.00251434]         -0
  6  Pr    0.768831  0.476206  0.707375     8.873  [-0.00089694, 0.00307943, 0.00126846]          -0
  7  Pr    0.231169  0.938545  0.707375     8.873  [-0.00177791, 0.00155354, 0.00251434]          -0
  8  Pr    0.231169  0.523794  0.292625     8.873  [0.00089694, -0.00307943, -0.00126846]         -0
  9  Pr    0.707375  0.768831  0.476206     8.873  [0.00223436, 0.00076295, 0.00251434]           -0
 10  Pr    0.292625  0.231169  0.523794     8.873  [-0.00223436, -0.00076295, -0.00251434]        -0
 11  Pr    0.707375  0.231169  0.938545     8.873  [0.00311533, -0.00076295, 0.00126846]          -0
 12  Pr    0.292625  0.768831  0.061455     8.873  [-0.00311533, 0.00076295, -0.00126846]         -0
 13  Ho    0.5       0.5       0            7.546  [-0.0, -0.0, -0.0]                              0
 14  Ho    0         0.5       0.5          7.546  [0.0, -0.0, -0.0]                               0
 15  Ho    0.5       0         0.5          7.546  [-0.0, -0.0, -0.0]                              0
 16  Ho    0.5       0.5       0.5          8.703  [-0.0, -0.0, -0.0]                              0
 17  Ho    0         0         0.5          8.703  [-0.0, -0.0, -0.0]                              0
 18  Ho    0.5       0         0            8.703  [-0.0, -0.0, -0.0]                              0
 19  Ho    0         0.5       0            8.703  [-0.0, -0.0, -0.0]                              0
Done with task: read_nested_structure_into_class
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 26.94 seconds
Time to read from json: 0.00 seconds

The previous cell takes on the order of tens of seconds (about 27 seconds in this run) because the nested structure has to be reconstructed. Further queries will be faster since the nested dataset has already been built.
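
The rebuilt dataset is persisted on disk, so the cache survives between sessions. The sketch below assumes it is written next to the original dataset as alexandria_3D_nested, following the {dataset_name}_nested convention mentioned in the comments above; adjust the path if your version stores it elsewhere.

[ ]:
# Sketch: inspect the cached nested dataset. The directory name is an assumption
# based on the "{dataset_name}_nested" naming convention noted earlier.
nested_dir = os.path.join(base_dir, "alexandria_3D_nested")
if os.path.exists(nested_dir):
    for file, size in list_file_sizes(nested_dir).items():
        print(f"{file}: {size:.1f} MB")
else:
    print(f"No nested dataset found at {nested_dir}")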

[22]:
start_time = time.time()
task_name = read_nested_structure_into_class(db)
nested_structure_time = time.time() - start_time

task_benchmark_dict["task_names"].append(task_name)
task_benchmark_dict["task_times"].append(nested_structure_time)

print(f"Time to read from parquetdb: {nested_structure_time:.2f} seconds")
print(f"Time to read from json: {total_time_to_read_from_json:.2f} seconds")
Starting task: read_nested_structure_into_class
(2, 3)
struct<band_gap_dir: double, band_gap_ind: double, decomposition: string, dos_ef: double, e_above_hull: double, e_form: double, e_phase_separation: double, elements: list<element: string>, energy_corrected: double, energy_total: double, formula: string, location: string, mat_id: string, nsites: int64, prototype_id: string, spg: int64, stress: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, total_mag: double>
structure type
struct<@class: string, @module: string, charge: int64, lattice: struct<a: double, alpha: double, b: double, beta: double, c: double, gamma: double, matrix: extension<arrow.fixed_shape_tensor[value_type=double, shape=[3,3]]>, pbc: extension<arrow.fixed_shape_tensor[value_type=bool, shape=[3]]>, volume: double>, sites: list<element: struct<abc: list<element: double>, label: string, properties: struct<charge: double, forces: list<element: double>, magmom: double>, species: list<element: struct<element: string, occu: int64>>, xyz: list<element: double>>>>
Full Formula (Ac1 Pr12 Ho7)
Reduced Formula: AcPr12Ho7
abc   :  10.091510  10.091510  10.091511
angles: 109.471217 109.471222 109.471219
pbc   :       True       True       True
Sites (20)
  #  SP           a         b         c    charge  forces                                     magmom
---  ----  --------  --------  --------  --------  ---------------------------------------  --------
  0  Ac    0         0         0            8.076  [0.0, -0.0, -0.0]                              -0
  1  Pr    0.476206  0.707375  0.768831     8.873  [0.0022184, 0.00231648, -0.00126846]           -0
  2  Pr    0.938545  0.707375  0.231169     8.873  [0.00045645, 0.00231648, -0.00251434]          -0
  3  Pr    0.523794  0.292625  0.231169     8.873  [-0.0022184, -0.00231648, 0.00126846]          -0
  4  Pr    0.061455  0.292625  0.768831     8.873  [-0.00045645, -0.00231648, 0.00251434]         -0
  5  Pr    0.768831  0.061455  0.292625     8.873  [0.00177791, -0.00155354, -0.00251434]         -0
  6  Pr    0.768831  0.476206  0.707375     8.873  [-0.00089694, 0.00307943, 0.00126846]          -0
  7  Pr    0.231169  0.938545  0.707375     8.873  [-0.00177791, 0.00155354, 0.00251434]          -0
  8  Pr    0.231169  0.523794  0.292625     8.873  [0.00089694, -0.00307943, -0.00126846]         -0
  9  Pr    0.707375  0.768831  0.476206     8.873  [0.00223436, 0.00076295, 0.00251434]           -0
 10  Pr    0.292625  0.231169  0.523794     8.873  [-0.00223436, -0.00076295, -0.00251434]        -0
 11  Pr    0.707375  0.231169  0.938545     8.873  [0.00311533, -0.00076295, 0.00126846]          -0
 12  Pr    0.292625  0.768831  0.061455     8.873  [-0.00311533, 0.00076295, -0.00126846]         -0
 13  Ho    0.5       0.5       0            7.546  [-0.0, -0.0, -0.0]                              0
 14  Ho    0         0.5       0.5          7.546  [0.0, -0.0, -0.0]                               0
 15  Ho    0.5       0         0.5          7.546  [-0.0, -0.0, -0.0]                              0
 16  Ho    0.5       0.5       0.5          8.703  [-0.0, -0.0, -0.0]                              0
 17  Ho    0         0         0.5          8.703  [-0.0, -0.0, -0.0]                              0
 18  Ho    0.5       0         0            8.703  [-0.0, -0.0, -0.0]                              0
 19  Ho    0         0.5       0            8.703  [-0.0, -0.0, -0.0]                              0
Done with task: read_nested_structure_into_class
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Time to read from parquetdb: 2.67 seconds
Time to read from json: 0.00 seconds

Plotting times

[23]:
import matplotlib.pyplot as plt
from parquetdb.utils import matplotlib_utils
from matplotlib import rcParams
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

%matplotlib inline
[24]:
xlabel_size = 16
ylabel_size = 16
title_size = 16
xtick_size = 14
ytick_size = 12

inset_xtick_size = 10
inset_ytick_size = 10
inset_title_size = 12

labels = task_benchmark_dict["task_names"]
times = task_benchmark_dict["task_times"]
# Create the main plot with numbered x labels and an inset showing the same data on a log scale
fig, ax = plt.subplots(figsize=(10, 6))

# Number the labels
numbered_labels = [f"{i+1}. {label}" for i, label in enumerate(labels)]

# matplotlib_utils.set_palette('Cavalcanti1')
matplotlib_utils.set_palette("Darjeeling1_alt")
# matplotlib_utils.set_palette('Zissou1')
# matplotlib_utils.set_palette('AsteroidCity1')
# matplotlib_utils.set_palette('BottleRocket2')
colors = rcParams["axes.prop_cycle"].by_key()["color"]
# Main horizontal bar plot
# ax.barh(numbered_labels, times, color="#59b9de")
ax.barh(numbered_labels, times, color=colors[: len(times)])
ax.set_xlabel("Total Time (seconds)", fontsize=xlabel_size)
ax.set_ylabel("Operations", fontsize=ylabel_size)

ax.tick_params(axis="x", labelsize=xtick_size)
ax.tick_params(axis="y", labelsize=ytick_size)
ax.set_title(
    "Total Time for Various Operations on dataset with 4.8 million rows",
    fontsize=title_size,
)

# Inset plot with log scale and just the numbers
# ax_inset = inset_axes(ax, width="40%", height="30%", loc="center right")

ax_inset = inset_axes(
    ax,
    width="30%",
    height="30%",
    loc="center right",
    bbox_to_anchor=(-0.05, -0.05, 1, 1),
    bbox_transform=ax.transAxes,
)
# ax_inset.barh(range(1, len(labels) + 1), times, color="#e52207")
ax_inset.barh(range(1, len(labels) + 1), times, color=colors[: len(times)])
ax_inset.set_xscale("log")
ax_inset.set_yticks(range(1, len(labels) + 1))  # Show just the numbers
ax_inset.set_yticklabels(range(1, len(labels) + 1), fontsize=inset_ytick_size)
ax_inset.set_title("Log Scale", fontsize=inset_title_size)

# Adjust layout and show the plot
plt.tight_layout()

plt.show()
C:\Users\lllang\AppData\Local\Temp\ipykernel_67684\1793184629.py:57: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  plt.tight_layout()
../../_images/examples_notebooks_Example_1_-_3D_Alexandria_Database_49_1.png