library(tiledbsc)
#> The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
#> which was just loaded, will retire in October 2023.
#> Please refer to R-spatial evolution reports for details, especially
#> https://r-spatial.org/r/2023/05/15/evolution4.html.
#> It may be desirable to make the sf package available;
#> package maintainers should consider adding sf to Suggests:.
#> The sp package is now running under evolution status 2
#> (status 2 uses the sf package in place of rgdal)
library(SeuratObject)
#> Loading required package: spThis vignette demonstrates how to leverage tiledbsc’s features for subsetting and filtering data stored as a SOMA.
As described in ?SOMA, there are two primary approaches
for subsetting: dimension slicing and attribute filtering, both of which
are described in this vignette and are performed using the
set_query() method, provided by all tiledbsc objects.
tiledbsc objects within a SOMA are indexed by a set of common cell- or feature-identifiers, which are efficiently sliced by providing a list of dimension identifiers.
As an example, let’s ingest cell-level metadata from the
pbmc_small dataset into an AnnotationDataframe
indexed by the cell identifiers.
data("pbmc_small", package = "SeuratObject")
adf <- AnnotationDataframe$new(uri = file.path(tempdir(), "pbmc_small_obs"))
#> No AnnotationDataframe found at '/tmp/RtmpwLzq6u/pbmc_small_obs'
adf$from_dataframe(pbmc_small[[]], index_col = "obs_id")
#> Creating new AnnotationDataframe array with index [obs_id] at '/tmp/RtmpwLzq6u/pbmc_small_obs'
#> Adding 3 metadata keys to array
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_obs'
#> Ingesting AnnotationDataframe data into: /tmp/RtmpwLzq6u/pbmc_small_obs
adf
#> <AnnotationDataframe>
#> uri: /tmp/RtmpwLzq6u/pbmc_small_obs
#> dimensions: obs_id
#> attributes: orig.ident, nCount_RNA, nFeature_RNA, RNA_snn_res.0.8, letter.idents, groups,...As you’d expect, reading the data back into memory recreates the same data frame with all 80 cells.
dim(adf$to_dataframe())
#> Reading AnnotationDataframe into memory from '/tmp/RtmpwLzq6u/pbmc_small_obs'
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_obs'
#> [1] 80 7We can use the set_query() method to read in a specific
subset of cells by providing a named list of ranges for each
dimension.
adf$set_query(
dims = list(obs_id = c("AAATTCGAATCACG", "AAGCAAGAGCTTAG", "AAGCGACTTTGACG"))
)
#> NULLNow, the next time we call to_dataframe() the resulting
data frame will only contain the cells we specified.
adf$to_dataframe()
#> Reading AnnotationDataframe into memory from '/tmp/RtmpwLzq6u/pbmc_small_obs'
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_obs'
#> orig.ident nCount_RNA nFeature_RNA RNA_snn_res.0.8
#> AAATTCGAATCACG SeuratProject 327 62 1
#> AAGCAAGAGCTTAG SeuratProject 126 48 0
#> AAGCGACTTTGACG SeuratProject 443 77 1
#> letter.idents groups RNA_snn_res.1
#> AAATTCGAATCACG B g2 1
#> AAGCAAGAGCTTAG A g1 0
#> AAGCGACTTTGACG B g1 1To clear this query and again read in all cells, we can use the
reset_query() method.
adf$reset_query()This functionality is also available for
AnnotationGroup-based classes, which ostensibly contain a
group of arrays that all share the same dimensions. In this case, the
dims argument is automatically applied to all child
arrays.
The SOMA class also provides a set_query()
method for the same purpose. However, unlike the lower-level classes
which are meant to be general-purpose, the SOMA class has
well-defined requirements for the dimensions of its member arrays. As
such, SOMA$set_query() provides arguments,
obs_ids and var_ids, rather than the more
general dims argument.
To see this in action, let’s create a SOMA from
pbmc_small’s RNA assay.
soma <- SOMA$new(uri = file.path(tempdir(), "pbmc_small_rna"))
#> No SOMA currently exists at '/tmp/RtmpwLzq6u/pbmc_small_rna'
#> Creating new SOMA at '/tmp/RtmpwLzq6u/pbmc_small_rna'
#> No AnnotationDataframe found at '/tmp/RtmpwLzq6u/pbmc_small_rna/obs'
#> No AnnotationDataframe found at '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> No AssayMatrixGroup currently exists at '/tmp/RtmpwLzq6u/pbmc_small_rna/X'
#> Creating new AssayMatrixGroup at '/tmp/RtmpwLzq6u/pbmc_small_rna/X'
#> No AnnotationMatrixGroup currently exists at '/tmp/RtmpwLzq6u/pbmc_small_rna/obsm'
#> Creating new AnnotationMatrixGroup at '/tmp/RtmpwLzq6u/pbmc_small_rna/obsm'
#> No AnnotationMatrixGroup currently exists at '/tmp/RtmpwLzq6u/pbmc_small_rna/varm'
#> Creating new AnnotationMatrixGroup at '/tmp/RtmpwLzq6u/pbmc_small_rna/varm'
#> No AnnotationPairwiseMatrixGroup currently exists at '/tmp/RtmpwLzq6u/pbmc_small_rna/obsp'
#> Creating new AnnotationPairwiseMatrixGroup at '/tmp/RtmpwLzq6u/pbmc_small_rna/obsp'
#> No AnnotationPairwiseMatrixGroup currently exists at '/tmp/RtmpwLzq6u/pbmc_small_rna/varp'
#> Creating new AnnotationPairwiseMatrixGroup at '/tmp/RtmpwLzq6u/pbmc_small_rna/varp'
#> No TileDBGroup currently exists at '/tmp/RtmpwLzq6u/pbmc_small_rna/uns'
#> Creating new TileDBGroup at '/tmp/RtmpwLzq6u/pbmc_small_rna/uns'
soma$from_seurat_assay(pbmc_small[["RNA"]])
#> Creating new AnnotationDataframe array with index [obs_id] at '/tmp/RtmpwLzq6u/pbmc_small_rna/obs'
#> Adding 3 metadata keys to array
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_rna/obs'
#> Ingesting AnnotationDataframe data into: /tmp/RtmpwLzq6u/pbmc_small_rna/obs
#> Creating new AnnotationDataframe array with index [var_id] at '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> Adding 3 metadata keys to array
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> Ingesting AnnotationDataframe data into: /tmp/RtmpwLzq6u/pbmc_small_rna/var
#> No AssayMatrix found at '/tmp/RtmpwLzq6u/pbmc_small_rna/X/counts'
#> Creating new AssayMatrix array with index [var_id,obs_id] at '/tmp/RtmpwLzq6u/pbmc_small_rna/X/counts'
#> Adding 3 metadata keys to array
#> Ingesting AssayMatrix data into: /tmp/RtmpwLzq6u/pbmc_small_rna/X/counts
#> No AssayMatrix found at '/tmp/RtmpwLzq6u/pbmc_small_rna/X/data'
#> Creating new AssayMatrix array with index [var_id,obs_id] at '/tmp/RtmpwLzq6u/pbmc_small_rna/X/data'
#> Adding 3 metadata keys to array
#> Ingesting AssayMatrix data into: /tmp/RtmpwLzq6u/pbmc_small_rna/X/data
#> No AssayMatrix found at '/tmp/RtmpwLzq6u/pbmc_small_rna/X/scale.data'
#> Creating new AssayMatrix array with index [var_id,obs_id] at '/tmp/RtmpwLzq6u/pbmc_small_rna/X/scale.data'
#> Adding 3 metadata keys to array
#> Ingesting AssayMatrix data into: /tmp/RtmpwLzq6u/pbmc_small_rna/X/scale.data
#> Finished converting Seurat Assay with key [rna_] to SOMANow let’s use set_query() to slice three features of
interest.
soma$set_query(var_ids = c("PPBP", "PF4", "GNLY"))Any of the individual SOMA components will respect the new query. So
if we read in the feature-level metadata from var, the
resulting data frame contains only the features we specified.
soma$var$to_dataframe()
#> Reading AnnotationDataframe into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> vst.mean vst.variance vst.variance.expected vst.variance.standardized
#> GNLY 2.400 43.76203 24.07857 1.817468
#> PF4 2.150 65.24304 21.48275 1.939028
#> PPBP 5.325 231.81709 93.14856 2.488681
#> vst.variable highly_variable
#> GNLY TRUE TRUE
#> PF4 TRUE TRUE
#> PPBP TRUE TRUEAnd when we recreate the entire Seurat Assay from the
SOMA, the slice is carried through to all relevant
components,
pbmc_small_rna <- soma$to_seurat_assay()
#> Reading AssayMatrix into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/X/counts'
#> Reading AssayMatrix into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/X/data'
#> Reading AssayMatrix into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/X/scale.data'
#> Reading AnnotationDataframe into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_rna/var'including the raw counts,
GetAssayData(pbmc_small_rna, slot = "counts")
#> 3 x 80 sparse Matrix of class "dgCMatrix"
#> [[ suppressing 80 column names 'AAGCAAGAGCTTAG', 'ACAGGTACTGGTGT', 'AGATATACCCGTAA' ... ]]
#>
#> GNLY 4 15 3 1 1 1 18 1 35 15 3 11 1 18 3 22 29 10 1 . . . . . . . . .
#> PF4 . . . . 1 . . . . . . . . . . . . . . 14 14 14 9 62 6 11 18 23
#> PPBP . . . . . . . . . . . . . . . . . . . 34 36 43 66 54 30 41 55 58
#>
#> GNLY . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
#> PF4 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
#> PPBP 1 1 6 1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
#>
#> GNLY . . . . . . . . . . . . . . .
#> PF4 . . . . . . . . . . . . . . .
#> PPBP . . . . . . . . . . . . . . .and normalized/scaled data:
GetAssayData(pbmc_small_rna, slot = "data")
#> 3 x 80 sparse Matrix of class "dgCMatrix"
#> [[ suppressing 80 column names 'AAGCAAGAGCTTAG', 'ACAGGTACTGGTGT', 'AGATATACCCGTAA' ... ]]
#>
#> GNLY 5.763498 6.902117 5.084058 3.040721 3.582547 3.160117 6.346724 4.676402
#> PF4 . . . . 3.582547 . . .
#> PPBP . . . . . . . .
#>
#> GNLY 7.26106 6.677903 6.596746 6.509859 2.657641 7.26513 4.99761 6.904224
#> PF4 . . . . . . . .
#> PPBP . . . . . . . .
#>
#> GNLY 7.528318 6.464628 4.378773 . . . . .
#> PF4 . . . 6.794273 6.726633 6.494325 6.101539 7.071124
#> PPBP . . . 7.680917 7.670362 7.615447 8.092033 6.933099
#>
#> GNLY . . . . . . . . .
#> PF4 5.993961 6.462304 6.754771 6.917480 . . . . .
#> PPBP 7.601402 7.776837 7.870948 7.841831 3.976987 3.117873 5.554937 4.753095 .
#>
#> GNLY . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
#> PF4 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
#> PPBP . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
#>
#> GNLY . . . . . . . . . .
#> PF4 . . . . . . . . . .
#> PPBP . . . . . . . . . .Again, to retrieve all of the original data we need to reset the query:
soma$reset_query()In addition to dimension slicing, set_query() allows you
to filter query results using attribute filters. As a reminder, in
TileDB parlance attributes are the non-dimensions (i.e., unindexed)
components of a dataset.
This is particularly useful for filtering SOMAs based on cell- and
feature-level metadata stored in the obs and
var arrays. For example, let’s filter out genes with a
vst.mean <= 5.
Unlink dimension slicing, attribute filters are applied immediately
to the obs or var arrays, and the identifiers
that passed the filter are used to slice the data at read time.
soma$set_query(var_attr_filter = vst.mean > 5)
#> Querying var with attribute filter
#> ...retrieved 12 var IDsWith verbose=TRUE, running the above command will print
a message indicating how many identifiers were selected. Now, when we
read the data back into memory the resulting Assay will
only include the 12 selected features.
soma$to_seurat_assay()
#> Reading AssayMatrix into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/X/counts'
#> Reading AssayMatrix into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/X/data'
#> Reading AssayMatrix into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/X/scale.data'
#> Reading AnnotationDataframe into memory from '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> Checking legacy validity mode for array: '/tmp/RtmpwLzq6u/pbmc_small_rna/var'
#> Assay data with 12 features for 80 cells
#> Top 3 variable features:
#> HLA-DPB1, PPBP, S100A9These two filtering methods can also be combined. For example, let’s apply the same attribute filter as above but only on the subset of features previously identified as highly-variable.
vfs <- pbmc_small_rna@var.features
soma$set_query(var_ids = vfs, var_attr_filter = vst.mean > 5)
#> Querying var with attribute filter
#> ...retrieved 1 var IDsThis approach can be more performant because the filter is selectively applied to the subset of indexed identifiers.
sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
#>
#> locale:
#> [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
#> [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
#> [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
#> [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] RcppSpdlog_0.0.14 SeuratObject_4.1.3 sp_2.0-0
#> [4] tiledbsc_0.1.5.9002
#>
#> loaded via a namespace (and not attached):
#> [1] sass_0.4.7 future_1.33.0 spdl_0.0.5
#> [4] stringi_1.7.12 lattice_0.21-8 listenv_0.9.0
#> [7] digest_0.6.33 magrittr_2.0.3 tiledb_0.20.3
#> [10] evaluate_0.21 grid_4.3.1 fastmap_1.1.1
#> [13] rprojroot_2.0.3 jsonlite_1.8.7 Matrix_1.5-4.1
#> [16] urltools_1.7.3 purrr_1.0.1 codetools_0.2-19
#> [19] textshaping_0.3.6 jquerylib_0.1.4 cli_3.6.1
#> [22] rlang_1.1.1 triebeard_0.4.1 parallelly_1.36.0
#> [25] future.apply_1.11.0 bit64_4.0.5 cachem_1.0.8
#> [28] yaml_2.3.7 tools_4.3.1 parallel_4.3.1
#> [31] nanotime_0.3.7 memoise_2.0.1 globals_0.16.2
#> [34] vctrs_0.6.3 R6_2.5.1 zoo_1.8-12
#> [37] lifecycle_1.0.3 stringr_1.5.0 fs_1.6.3
#> [40] bit_4.0.5 ragg_1.2.5 desc_1.4.2
#> [43] pkgdown_2.0.7 progressr_0.13.0 bslib_0.5.0
#> [46] glue_1.6.2 Rcpp_1.0.11 systemfonts_1.0.4
#> [49] xfun_0.40 knitr_1.43 htmltools_0.5.5
#> [52] rmarkdown_2.23 compiler_4.3.1 RcppCCTZ_0.2.12