Working with Arrays in a TileDB Group

This notebook contains an example of how to use TileDB and TileDB-CF to create, inspect, open, read, and write data to arrays in a TileDB group.

Data

The following arrays will be created that share some of their dimensions:

  • dense_3d is a dense array with dimensions dense_x, dense_y and dense_t
  • dense_2d is a dense array with dimensions dense_x and dense_y
  • dense_1d is a dense array with dimension dense_t
  • sparse_4d is a sparse array with dimensions sparse_x, sparse_y, sparse_z and sparse_t
  • dense_axes_xy is a dense array that contains the values of the dense_x and dense_y dimensions as dense_x_data and dense_y_data
  • dense_axes_t is a dense array that contains the values of the dense_t dimension as dense_t_data
  • sparse_axes is a sparse array that contains the values of the sparse_x, sparse_y, sparse_z and sparse_t dimensions as sparse_x_data, sparse_y_data, sparse_z_data and sparse_t_data

Packages

Import the libraries used in this notebook:

import time
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import tiledb
from tiledb.cf import AttrMetadata, ArrayMetadata, create_group, open_group_array

Create numpy arrays

Variables to set the size of the arrays:

# Number of cells along each of the dense x and y axes.
dense_size = 100
# Number of cells along the time axis (used by both dense and sparse arrays).
t_size = 365
# Number of randomly generated points written to the sparse arrays.
sparse_size = 2000

Functions used to create a dataset:

def ripple(x, y, t):
    """Return a radial sine ripple over (x, y), scaled by 1 / (t + 1).

    Computes ``sin(t * (x**2 + y**2)) / (t + 1)``. The arguments may be scalars
    or NumPy arrays that broadcast against each other.
    """
    return np.sin(t * (x**2 + y**2)) / (t + 1)


def ripple2(x, y, z, t):
    """Return ``ripple(x, y, t)`` shifted by the offset ``z``."""
    # Delegate to ripple() so the two formulas cannot drift apart.
    return ripple(x, y, t) + z

Data for the dense arrays

# Coordinate values for each dense dimension; they start at 1 to match the
# lower bound of the dimension domains.
dense_x_values = np.arange(1, dense_size + 1)
dense_y_values = np.arange(1, dense_size + 1)
dense_t_values = np.arange(1, t_size + 1)

# np.fromfunction calls ripple(x, y, t) once with 0-based index grids of the
# requested shape, so the result has shape (dense_size, dense_size, t_size).
dense_3d_values = np.fromfunction(ripple, (dense_size, dense_size, t_size))

# Collapse the time axis (ignoring NaNs) to get one value per (x, y) cell.
dense_2d_values = np.nanmean(dense_3d_values, axis=2)
# Collapse both spatial axes to get one value per time step.
dense_1d_values = np.mean(dense_3d_values, axis=(0, 1))

Data for the sparse arrays

# Random integer coordinates for the sparse points. The three spatial axes are
# drawn from [1, dense_size] (x, then y, then z); the time axis from [1, t_size].
sparse_x_values, sparse_y_values, sparse_z_values = (
    np.random.randint(1, dense_size + 1, size=sparse_size) for _ in range(3)
)
sparse_t_values = np.random.randint(1, t_size + 1, size=sparse_size)

# One data value per random (x, y, z, t) point.
sparse_4d_values = ripple2(
    x=sparse_x_values,
    y=sparse_y_values,
    z=sparse_z_values,
    t=sparse_t_values,
)

Create the TileDB Group

# Dimensions shared between the arrays of the group. All of them are unsigned
# 64-bit integer dimensions with a tile extent of 10 and a domain starting at 1.
dense_x, dense_y = (
    tiledb.Dim(name=dim_name, domain=(1, dense_size), tile=10, dtype=np.uint64)
    for dim_name in ("dense_x", "dense_y")
)
dense_t = tiledb.Dim(name="dense_t", domain=(1, t_size), tile=10, dtype=np.uint64)
sparse_x, sparse_y, sparse_z = (
    tiledb.Dim(name=dim_name, domain=(1, sparse_size), tile=10, dtype=np.uint64)
    for dim_name in ("sparse_x", "sparse_y", "sparse_z")
)
sparse_t = tiledb.Dim(name="sparse_t", domain=(1, t_size), tile=10, dtype=np.uint64)

# Schema of every array in the group, keyed by array name.
array_schemas = {
    # Dense data arrays: one float64 attribute each.
    "dense_3d": tiledb.ArraySchema(
        domain=tiledb.Domain(dense_x, dense_y, dense_t),
        attrs=[tiledb.Attr(name="dense_3d_data", dtype=np.float64)],
    ),
    "dense_2d": tiledb.ArraySchema(
        domain=tiledb.Domain(dense_x, dense_y),
        attrs=[tiledb.Attr(name="dense_2d_data", dtype=np.float64)],
    ),
    "dense_1d": tiledb.ArraySchema(
        domain=tiledb.Domain(dense_t),
        attrs=[tiledb.Attr(name="dense_1d_data", dtype=np.float64)],
    ),
    # Sparse data array; duplicate coordinates are allowed.
    "sparse_4d": tiledb.ArraySchema(
        domain=tiledb.Domain(sparse_x, sparse_y, sparse_z, sparse_t),
        attrs=[tiledb.Attr(name="sparse_4d_data", dtype=np.float64)],
        sparse=True,
        allows_duplicates=True,
    ),
    # Axis arrays: the coordinate values stored as uint64 attributes. Both the
    # x and y values are stored along dense_x, which has the same domain as
    # dense_y.
    "dense_axes_xy": tiledb.ArraySchema(
        domain=tiledb.Domain(dense_x),
        attrs=[
            tiledb.Attr(name=attr_name, dtype=np.uint64)
            for attr_name in ("dense_x_data", "dense_y_data")
        ],
    ),
    "dense_axes_t": tiledb.ArraySchema(
        domain=tiledb.Domain(dense_t),
        attrs=[tiledb.Attr(name="dense_t_data", dtype=np.uint64)],
    ),
    # All four sparse coordinate lists are stored along the sparse_x dimension.
    "sparse_axes": tiledb.ArraySchema(
        domain=tiledb.Domain(sparse_x),
        attrs=[
            tiledb.Attr(name=attr_name, dtype=np.uint64)
            for attr_name in (
                "sparse_x_data",
                "sparse_y_data",
                "sparse_z_data",
                "sparse_t_data",
            )
        ],
        sparse=True,
        allows_duplicates=True,
    ),
}

Create the TileDB Group on disk

# Location of the TileDB group; every later cell refers to it through this name.
group_uri = "output/example_group"
# Only create the group when no TileDB object exists at the URI yet, so this
# cell can be run again without trying to re-create an existing group.
if tiledb.object_type(group_uri) is None:
    create_group(group_uri, array_schemas)

Write data to the TileDB Group

Writing to dense and sparse arrays uses standard TileDB write operations. Arrays can be opened one-by-one or multiple arrays can be opened at once.

with tiledb.Group(group_uri, mode="r") as group:
    # Dense data arrays, each selected through the name of its attribute.
    with (
        open_group_array(group, attr="dense_3d_data", mode="w") as dense_3d_array,
        open_group_array(group, attr="dense_2d_data", mode="w") as dense_2d_array,
        open_group_array(group, attr="dense_1d_data", mode="w") as dense_1d_array,
    ):
        dense_3d_array[:] = dense_3d_values
        dense_2d_array[:] = dense_2d_values
        dense_1d_array[:] = dense_1d_values

    # Sparse data array plus the array holding its coordinate values.
    with (
        open_group_array(group, attr="sparse_4d_data", mode="w") as sparse_data_array,
        open_group_array(group, array="sparse_axes", mode="w") as sparse_axes_array,
    ):
        sparse_coords = (
            sparse_x_values,
            sparse_y_values,
            sparse_z_values,
            sparse_t_values,
        )
        sparse_data_array[sparse_coords] = sparse_4d_values

        # The axes array is indexed by point number 1..sparse_size.
        point_index = np.arange(1, sparse_size + 1)
        sparse_axes_array[point_index] = {
            "sparse_x_data": sparse_x_values,
            "sparse_y_data": sparse_y_values,
            "sparse_z_data": sparse_z_values,
            "sparse_t_data": sparse_t_values,
        }

    # Axis arrays for the dense dimensions.
    with (
        open_group_array(group, array="dense_axes_xy", mode="w") as xy_array,
        open_group_array(
            group, array="dense_axes_t", attr="dense_t_data", mode="w"
        ) as t_array,
    ):
        # Multi-attribute write: one entry per attribute name.
        xy_array[:] = {
            "dense_x_data": dense_x_values,
            "dense_y_data": dense_y_values,
        }
        # Opened with a single attribute, so the values are assigned directly.
        t_array[:] = dense_t_values

Metadata

Write Group metadata:

# Open the group itself in write mode to attach group-level metadata.
with tiledb.Group(group_uri, mode="w") as group:
    group.meta["description"] = "Example TileDB Group"
    group.meta["version"] = "1.0"
    # Human-readable local timestamp of when this cell was run.
    group.meta["created on"] = time.ctime()

Write Array metadata:

with tiledb.Group(group_uri, mode="r") as group:
    # Array metadata is written through the `meta` mapping of an array opened
    # in write mode.
    with open_group_array(group, array="dense_3d", mode="w") as dense_3d_array:
        dense_3d_array.meta["description"] = (
            "Example 3D dense array with dimensions x, y and t"
        )
    with open_group_array(group, array="dense_axes_xy", mode="w") as xy_array:
        xy_array.meta["description"] = (
            "Values for the x and y dimensions of the 3D dense array"
        )

Write Attribute metadata using the AttrMetadata class:

with (
    tiledb.Group(group_uri, mode="r") as group,
    open_group_array(group, array="dense_axes_xy", mode="w") as array,
):
    # AttrMetadata wraps the array metadata so that entries are written for
    # the named attribute rather than for the array as a whole.
    x_attr_meta = AttrMetadata(array.meta, "dense_x_data")
    x_attr_meta["description"] = "Values of x"

Read data from the TileDB Group

Read the metadata

Read the Group metadata keys and their values:

# Print every group-level metadata entry as "key: value".
with tiledb.Group(group_uri) as group:
    for meta_key, meta_value in group.meta.items():
        print(f"{meta_key}: {meta_value}")

Read the metadata for the dense_axes_xy array. Optionally, you can use the ArrayMetadata and AttrMetadata classes to view only the array-level metadata or only the metadata of a single attribute.

with (
    tiledb.Group(group_uri) as group,
    open_group_array(group, array="dense_axes_xy") as array,
):
    # Three views over the same metadata: everything, array-level entries
    # only, and the entries of one attribute only.
    metadata_views = (
        ("All metadata in the array:", array.meta),
        ("Only array metadata:", ArrayMetadata(array.meta)),
        (
            "Only attribute metadata for 'dense_x_data':",
            AttrMetadata(array.meta, "dense_x_data"),
        ),
    )
    for heading, metadata in metadata_views:
        print(heading)
        for key, value in metadata.items():
            print(f"    * {key}: {value}")

Read and visualise the data

# Read the full 3D data array and the x/y axis values from the group.
with (
    tiledb.Group(group_uri) as group,
    open_group_array(group, array="dense_3d", attr="dense_3d_data") as data_array,
    open_group_array(group, array="dense_axes_xy") as axes_array,
):
    dense_3d_data = data_array[:]
    axes_data = axes_array[...]
    dense_x_data = axes_data["dense_x_data"]
    dense_y_data = axes_data["dense_y_data"]

# Filled contour plots of four slices along the last (time) axis, on a 2x2 grid
# filled row by row.
# NOTE(review): contourf expects Z indexed as [y, x], while these slices are
# indexed [x, y]. This is harmless here because both axes have the same length
# and the data is symmetric in x and y -- confirm before reusing with other data.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
for panel, t_index in zip(axes.flat, (33, 66, 99, 95)):
    panel.contourf(dense_x_data, dense_y_data, dense_3d_data[:, :, t_index])
# Load the whole sparse array (coordinates and data) into a DataFrame.
with (
    tiledb.Group(group_uri) as group,
    open_group_array(group, array="sparse_4d") as array,
):
    df = pd.DataFrame(array[...])

df.head()
df.describe().T
df.plot.scatter(x="sparse_x", y="sparse_4d_data");
# Load the 1D dense array into a DataFrame and plot it.
with (
    tiledb.Group(group_uri) as group,
    open_group_array(group, array="dense_1d") as array,
):
    df2 = pd.DataFrame(array[...])

df2.head()
df2.plot();