import netCDF4
import numpy as np
import tiledb
import tiledb.cf
NetCDF-To-TileDB: How to set tile sizes
About this Example
What it Shows
This example shows how to set the tile size for TileDB arrays that will be created by a `NetCDF4ConverterEngine`.
Example dataset
This example shows converting a small NetCDF file with 2 dimensions and 4 variables:
- Dimensions:
  - x: size=100
  - y: size=100
- Variables:
  - x(x)
  - y(y)
  - A1(x, y)
  - A2(x, y)
Set-up Requirements
This example requires that the following Python packages are installed: netCDF4, numpy, tiledb, tiledb-cf
# Set names for the output generated by the example.
# `output_dir` is the folder the example writes into; `netcdf_file` is the
# path of the sample NetCDF file created inside that folder.
output_dir = "output/netcdf-to-tiledb-set-filters"
netcdf_file = f"{output_dir}/simple1.nc"
# Reset output folder
import os
import shutil

# Remove any output left over from a previous run; a missing folder is not an
# error here, hence `ignore_errors=True`.
shutil.rmtree(output_dir, ignore_errors=True)
# `output_dir` is a nested path, so create missing parent folders as well
# (plain `os.mkdir` raises FileNotFoundError when the parent does not exist).
os.makedirs(output_dir)
# Create sample NetCDF file with no data
# The file has two dimensions (`x` and `y`, each of size 100), two variables
# defined on `(x, y)`, and one variable for each of the dimensions. No values
# are written to any of the variables.
with netCDF4.Dataset(netcdf_file, mode="w") as dataset:
    dataset.createDimension("x", 100)
    dataset.createDimension("y", 100)
    A1 = dataset.createVariable("A1", np.float64, ("x", "y"))
    A2 = dataset.createVariable("A2", np.float64, ("x", "y"))
    x1 = dataset.createVariable("x", np.float64, ("x",))
    y = dataset.createVariable("y", np.float64, ("y",))
print(f"Created example NetCDF file `{netcdf_file}`.")
Setting tile size after auto-generation
When using the `NetCDF4ConverterEngine` class, the most straightforward way to set tile sizes is by directly setting them after the arrays have been added to the converter engine. This can be done by accessing the array creator and setting the `tiles` property in the `DomainCreator` or the `tile` property in individual `DimensionConverter`s.
# Auto-generate a converter engine from the sample NetCDF file.
converter = tiledb.cf.NetCDF4ConverterEngine.from_file(netcdf_file)
# Bare expression: shows the converter as the cell output when run in a
# notebook; it has no effect when run as a plain script.
converter
# Set tile size with domain creator
# The array creator can be looked up either by array name or by the name of an
# attribute it contains; `tiles` takes one value per dimension of the array.
converter.get_array_creator("array0").domain_creator.tiles = (50,)
converter.get_array_creator_by_attr("y.data").domain_creator.tiles = (50,)
# Set tile size with the dimension creators
# Each dimension of the array is looked up by name and given its own tile size.
array1_domain = converter.get_array_creator("array1").domain_creator
array1_domain.dim_creator("x").tile = 10
array1_domain.dim_creator("y").tile = 20
# Summarize every array the converter will create: its name, the names of its
# dimensions, and the tile sizes currently set on its domain.
print("Generated TileDB Arrays:")
for array_creator in converter.array_creators():
    print(
        f" * {array_creator.name}({', '.join(dim_creator.name for dim_creator in array_creator.domain_creator)})"
    )
    print(f" - tiles: {array_creator.domain_creator.tiles}")
Setting tile size during auto-generation
The tile sizes in arrays can be set when using the `NetCDF4ConverterEngine` class methods `from_group` and `from_file`. The tile sizes can be set for the array that contains a particular variable using the `tiles_by_var` parameter or for arrays defined over a particular domain using `tiles_by_dims`.
How `tiles_by_var` and `tiles_by_dims` interact when there are conflicting tile sizes depends on whether the `collect_attrs` parameter is `True` or `False`.
Set the array grouping. NetCDF variables map to TileDB attributes. The `collect_attrs` parameter determines if each NetCDF variable is stored in a separate array, or if all NetCDF variables with the same underlying dimensions are stored in the same TileDB array. Scalar variables are always grouped together.
- `collect_attrs`: If `True`, store all attributes with the same dimensions in the same array. Otherwise, store each attribute in a separate array.
The `tiles_by_var` parameter is a mapping from variable name to the tiles for the dimensions of the array that variable is stored in. The `tiles_by_dims` parameter is a mapping from the names of the dimensions of the array to the tiles for the dimensions of the array. If using `collect_attrs=True`, then `tiles_by_dims` will overwrite `tiles_by_var`. If using `collect_attrs=False`, then `tiles_by_var` will overwrite `tiles_by_dims`.
# Try changing the parameters, `collect_attrs`, `tiles_by_dims`, and `tiles_by_var` and see how it affects the tile size for all dimensions
def test_setting_tiles(**kwargs):
    """Print the arrays and tile sizes generated for the given converter options.

    Builds an in-memory NetCDF group with dimensions `x` and `y` (size 100
    each) and variables `A1(x, y)`, `A2(x, y)`, `x(x)` and `y(y)`, creates a
    `NetCDF4ConverterEngine` from it, and prints the name, dimensions,
    attributes and tiles of every array the converter would create.

    Parameters:
        **kwargs: Keyword arguments forwarded unchanged to
            `tiledb.cf.NetCDF4ConverterEngine.from_group` (for example
            `collect_attrs`, `tiles_by_dims`, `tiles_by_var`).
    """
    print(f"Keyword arguments: {kwargs}")
    with netCDF4.Dataset("tmp.nc", mode="w", diskless=True) as netcdf_group:
        # Create a NetCDF group that only exists in memory.
        netcdf_group.createDimension("x", 100)
        netcdf_group.createDimension("y", 100)
        netcdf_group.createVariable("A1", np.float64, ("x", "y"))
        netcdf_group.createVariable("A2", np.float64, ("x", "y"))
        netcdf_group.createVariable("x", np.float64, ("x",))
        netcdf_group.createVariable("y", np.float64, ("y",))
        # Convert the group with the provided keyword arguments.
        converter = tiledb.cf.NetCDF4ConverterEngine.from_group(
            netcdf_group=netcdf_group, **kwargs
        )
        print("Generated TileDB Arrays:")
        for array_creator in converter.array_creators():
            print(
                f" * {array_creator.name}({', '.join(dim_creator.name for dim_creator in array_creator.domain_creator)})"
            )
            print(
                f" - attributes: {', '.join(attr_creator.name for attr_creator in array_creator)}"
            )
            print(f" - tiles: {array_creator.domain_creator.tiles}")
# 1. `collect_attrs=True`
# * `A1` and `A2` are in the same array.
# * `tile=None` for all dimensions.
test_setting_tiles(collect_attrs=True)
# 2. `collect_attrs=True`, `tiles_by_dims={(x,y): (10, 20)}`
# * `A1` and `A2` are in the same array.
# * Only array with dimensions `(x,y)` has tiles set.
test_setting_tiles(collect_attrs=True, tiles_by_dims={("x", "y"): (10, 20)})
# 3. `collect_attrs=True`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in the same array.
# * Only array with variable `A1` has tiles set.
test_setting_tiles(collect_attrs=True, tiles_by_var={"A1": (50, 50)})
# 4. `collect_attrs=True`, `tiles_by_dims={(x,y): (10, 20)}`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in the same array.
# * Only array with dimensions `(x,y)` has tiles set. `tiles_by_dims` took priority over `tiles_by_var`.
test_setting_tiles(
    collect_attrs=True,
    tiles_by_var={"A1": (50, 50)},
    tiles_by_dims={("x", "y"): (10, 20)},
)
# 5. `collect_attrs=False`
# * `A1` and `A2` are in separate arrays.
# * `tile=None` for all dimensions.
test_setting_tiles(collect_attrs=False)
# 6. `collect_attrs=False`, `tiles_by_dims={(x,y): (10, 20)}`
# * `A1` and `A2` are in separate arrays.
# * Only arrays with dimensions `(x,y)` have tiles set.
test_setting_tiles(collect_attrs=False, tiles_by_dims={("x", "y"): (10, 20)})
# 7. `collect_attrs=False`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in separate arrays.
# * Only array with variable `A1` has tiles set.
test_setting_tiles(collect_attrs=False, tiles_by_var={"A1": (50, 50)})
# 8. `collect_attrs=False`, `tiles_by_dims={(x,y): (10, 20)}`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in separate arrays.
# * The array with `A2` has tiles set by `tiles_by_dims`.
# * The array with `A1` has tiles set. `tiles_by_var` took priority over `tiles_by_dims`.
test_setting_tiles(
    collect_attrs=False,
    tiles_by_var={"A1": (50, 50)},
    tiles_by_dims={("x", "y"): (10, 20)},
)