import netCDF4
import numpy as np
import tiledb
import tiledb.cf

NetCDF-To-TileDB: How to set tile sizes
About this Example
What it Shows
This example shows how to set the tile size for TileDB arrays that will be created by a NetCDF4ConverterEngine.
Example dataset
This example shows converting a small NetCDF file with 2 dimensions and 4 variables:
- Dimensions:
- x: size=100
- y: size=100
- Variables:
- x(x)
- y(y)
- A1(x, y)
- A2(x, y)
Set-up Requirements
This example requires that the following Python packages are installed: netCDF4, numpy, tiledb, tiledb-cf
# Set names for the output generated by the example.
# NOTE(review): the folder name says "set-filters" although this example is
# about tile sizes -- possibly copied from another example; confirm the
# intended name.
output_dir = "output/netcdf-to-tiledb-set-filters"
netcdf_file = f"{output_dir}/simple1.nc"

# Reset output folder
import os
import shutil

shutil.rmtree(output_dir, ignore_errors=True)
# Create the folder together with any missing parent: `output_dir` is nested
# under `output/`, which is not guaranteed to exist.
os.makedirs(output_dir)

# Create sample NetCDF file with no data
with netCDF4.Dataset(netcdf_file, mode="w") as dataset:
    dataset.createDimension("x", 100)
    dataset.createDimension("y", 100)
    A1 = dataset.createVariable("A1", np.float64, ("x", "y"))
    A2 = dataset.createVariable("A2", np.float64, ("x", "y"))
    x1 = dataset.createVariable("x", np.float64, ("x",))
    y = dataset.createVariable("y", np.float64, ("y",))
print(f"Created example NetCDF file `{netcdf_file}`.")

# Setting tile size after auto-generation
When using the NetCDF4ConverterEngine class, the most straightforward way to set tile sizes is to set them directly after the arrays have been added to the converter engine. This can be done by accessing the array creator and setting the tiles property on its domain creator or the tile property on individual dimension creators.
# Build a converter for the sample file; tile sizes are adjusted on it below.
converter = tiledb.cf.NetCDF4ConverterEngine.from_file(netcdf_file)
converter

# Set tile size with domain creator
converter.get_array_creator("array0").domain_creator.tiles = (50,)
converter.get_array_creator_by_attr("y.data").domain_creator.tiles = (50,)

# Set tile size with the dimension creators
array1_domain = converter.get_array_creator("array1").domain_creator
array1_domain.dim_creator("x").tile = 10
array1_domain.dim_creator("y").tile = 20

# Report the name, dimensions and tiles of every array the converter defines.
print("Generated TileDB Arrays:")
for array_creator in converter.array_creators():
    dim_names = ", ".join(dim.name for dim in array_creator.domain_creator)
    print(f" * {array_creator.name}({dim_names})")
    print(f" - tiles: {array_creator.domain_creator.tiles}")

# Setting tile size during auto-generation
The tile sizes in arrays can be set when using the NetCDF4ConverterEngine class methods from_group and from_file. The tile sizes can be set for the array that contains a particular variable using the tiles_by_var parameter or for arrays defined over a particular domain using tiles_by_dims.
How tiles_by_var and tiles_by_dims interact when they specify conflicting tile sizes depends on whether the collect_attrs parameter is True or False.
Set the array grouping. A NetCDF variable maps to TileDB attributes. The
collect_attrs parameter determines if each NetCDF variable is stored in a separate array, or if all NetCDF variables with the same underlying dimensions are stored in the same TileDB array. Scalar variables are always grouped together. collect_attrs: If True, store all attributes with the same dimensions in the same array. Otherwise, store each attribute in a separate array.
The
tiles_by_var parameter is a mapping from variable name to the tiles for the dimensions of the array that variable is stored in. The tiles_by_dims parameter is a mapping from the names of the dimensions of the array to the tiles for the dimensions of the array. If using collect_attrs=True, then tiles_by_dims will over-write tiles_by_var. If using collect_attrs=False, then tiles_by_var will over-write tiles_by_dims.
# Try changing the parameters `collect_attrs`, `tiles_by_dims`, and `tiles_by_var` and see how it affects the tile size for all dimensions
def test_setting_tiles(**kwargs):
    """Convert an in-memory NetCDF group and print the resulting tile sizes.

    Builds a NetCDF group with dimensions ``x`` and ``y`` (size 100 each) and
    variables ``A1(x, y)``, ``A2(x, y)``, ``x(x)`` and ``y(y)``, passes it to
    ``NetCDF4ConverterEngine.from_group``, and prints the name, dimensions,
    attributes and tiles of every array creator the converter produced.

    Parameters:
        **kwargs: Forwarded unchanged to ``NetCDF4ConverterEngine.from_group``
            (for example ``collect_attrs``, ``tiles_by_dims``,
            ``tiles_by_var``).
    """
    print(f"Keyword arguments: {kwargs}")
    with netCDF4.Dataset("tmp.nc", mode="w", diskless=True) as netcdf_group:
        # Create a NetCDF group that only exists in memory.
        netcdf_group.createDimension("x", 100)
        netcdf_group.createDimension("y", 100)
        netcdf_group.createVariable("A1", np.float64, ("x", "y"))
        netcdf_group.createVariable("A2", np.float64, ("x", "y"))
        netcdf_group.createVariable("x", np.float64, ("x",))
        netcdf_group.createVariable("y", np.float64, ("y",))
        # Convert the group with the provided keyword arguments.
        converter = tiledb.cf.NetCDF4ConverterEngine.from_group(
            netcdf_group=netcdf_group, **kwargs
        )
        # The report is printed while the group is still open, so it does not
        # depend on the converter being usable after the group is closed.
        print("Generated TileDB Arrays:")
        for array_creator in converter.array_creators():
            dim_names = ", ".join(
                dim.name for dim in array_creator.domain_creator
            )
            print(f" * {array_creator.name}({dim_names})")
            attr_names = ", ".join(attr.name for attr in array_creator)
            print(f" - attributes: {attr_names}")
            print(f" - tiles: {array_creator.domain_creator.tiles}")


# 1. `collect_attrs=True`
# * `A1` and `A2` are in the same array.
# * `tile=None` for all dimensions.
test_setting_tiles(
    collect_attrs=True,
)

# 2. `collect_attrs=True`, `tiles_by_dims={(x,y): (10, 20)}`
# * `A1` and `A2` are in the same array.
# * Only array with dimensions `(x,y)` has tiles set.
test_setting_tiles(
    collect_attrs=True,
    tiles_by_dims={("x", "y"): (10, 20)},
)

# 3. `collect_attrs=True`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in the same array.
# * Only array with variable `A1` has tiles set.
test_setting_tiles(
    collect_attrs=True,
    tiles_by_var={"A1": (50, 50)},
)

# 4. `collect_attrs=True`, `tiles_by_dims={(x,y): (10, 20)}`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in the same array.
# * Only array with dimensions `(x,y)` has tiles set. `tiles_by_dims` took priority over `tiles_by_var`.
test_setting_tiles(
    collect_attrs=True,
    tiles_by_var={"A1": (50, 50)},
    tiles_by_dims={("x", "y"): (10, 20)},
)

# 5. `collect_attrs=False`
# * `A1` and `A2` are in separate arrays.
# * `tile=None` for all dimensions.
test_setting_tiles(
    collect_attrs=False,
)

# 6. `collect_attrs=False`, `tiles_by_dims={(x,y): (10, 20)}`
# * `A1` and `A2` are in separate arrays.
# * Only arrays with dimensions `(x,y)` have tiles set.
test_setting_tiles(
    collect_attrs=False,
    tiles_by_dims={("x", "y"): (10, 20)},
)

# 7. `collect_attrs=False`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in separate arrays.
# * Only array with variable `A1` has tiles set.
test_setting_tiles(
    collect_attrs=False,
    tiles_by_var={"A1": (50, 50)},
)

# 8. `collect_attrs=False`, `tiles_by_dims={(x,y): (10, 20)}`, `tiles_by_var={'A1': (50, 50)}`
# * `A1` and `A2` are in separate arrays.
# * The array with `A2` has tiles set by `tiles_by_dims`.
# * The array with `A1` has tiles set. `tiles_by_var` took priority over `tiles_by_dims`.
test_setting_tiles(
    collect_attrs=False,
    tiles_by_var={"A1": (50, 50)},
    tiles_by_dims={("x", "y"): (10, 20)},
)