import tiledbvcf
import tiledb
TileDB-VCF Allele Frequencies
This notebook will walk you through
- Selecting allele frequencies
- Filtering on allele frequencies
- Direct allele frequency access
This notebook uses the TileDB-Inc/vcf-1kghicov-dragen-v376 version of the 1000 genome project.
= "tiledb://TileDB-Inc/vcf-1kghicov-dragen-v376" uri
Including Allele Frequency Field
The computed allele frequency can be included in results with the info_TILEDB_IAF
attribute
= ["sample_name", "contig", "pos_start", "pos_end", "alleles", "fmt_GT", "info_TILEDB_IAF"]
attributes
= tiledbvcf.Dataset(uri, mode="r") ds
# Query for BTD across all 3404 samples
= ds.read(
df =attributes,
attrs=["chr3:15601341-15722311"],
regions )
df
sample_name | contig | pos_start | pos_end | alleles | fmt_GT | info_TILEDB_IAF | |
---|---|---|---|---|---|---|---|
0 | HG00096 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None |
1 | HG00097 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None |
2 | HG00099 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None |
3 | HG00100 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None |
4 | HG00101 | chr3 | 15601536 | 15601536 | [A, G] | [0, 1] | None |
... | ... | ... | ... | ... | ... | ... | ... |
606391 | NA21143 | chr3 | 15721413 | 15721413 | [C, T] | [0, 1] | None |
606392 | NA21144 | chr3 | 15721413 | 15721413 | [C, T] | [1, 1] | None |
606393 | NA21144 | chr3 | 15721470 | 15721470 | [T, C] | [0, 1] | None |
606394 | NA21143 | chr3 | 15721933 | 15721933 | [C, T] | [0, 1] | None |
606395 | NA21144 | chr3 | 15722066 | 15722066 | [A, G] | [0, 1] | None |
606396 rows × 7 columns
Filtering on IAF
Filters for allele frequency can also be included by using the set_af_filter
parameter.
# Query for BTD across all 3404 samples
= ds.read(
df =attributes,
attrs=["chr3:15601341-15722311"],
regions="<0.5",
set_af_filter )
df
sample_name | contig | pos_start | pos_end | alleles | fmt_GT | info_TILEDB_IAF | |
---|---|---|---|---|---|---|---|
0 | HG00101 | chr3 | 15601536 | 15601536 | [A, G] | [0, 1] | [0.027682202, 0.9723178] |
1 | HG00100 | chr3 | 15601668 | 15601668 | [G, A] | [0, 1] | [0.43612567, 0.56387436] |
2 | HG00100 | chr3 | 15602568 | 15602568 | [A, G] | [0, 1] | [0.42662117, 0.57337886] |
3 | HG00100 | chr3 | 15602688 | 15602688 | [G, A] | [0, 1] | [0.47108433, 0.52891564] |
4 | HG00096 | chr3 | 15603161 | 15603161 | [A, G] | [0, 1] | [0.44444445, 0.5555556] |
... | ... | ... | ... | ... | ... | ... | ... |
370583 | NA21144 | chr3 | 15721189 | 15721189 | [C, G] | [0, 1] | [0.43555242, 0.5644476] |
370584 | NA21143 | chr3 | 15721340 | 15721340 | [G, A] | [0, 1] | [0.3335806, 0.6664194] |
370585 | NA21143 | chr3 | 15721413 | 15721413 | [C, T] | [0, 1] | [0.33333334, 0.6666667] |
370586 | NA21144 | chr3 | 15721470 | 15721470 | [T, C] | [0, 1] | [0.4868421, 0.5131579] |
370587 | NA21144 | chr3 | 15722066 | 15722066 | [A, G] | [0, 1] | [0.4359155, 0.56408453] |
370588 rows × 7 columns
Accessing Allele Frequencies Directly
Allele frequencies can also be queried directly in addition to as part of the variant query
# Get the variant stats ur
with tiledb.Group(uri) as g:
= g["variant_stats"].uri alleles_uri
# BRCA1 from https://www.ncbi.nlm.nih.gov/gene/672
= 'chr17'
contig = slice(43_036_174, 43_133_600) # pos is 0 based region
# Query allele frequencies and get results as a pandas dataframe
with tiledb.open(alleles_uri) as A:
= A.query(attrs=["ac", "allele"], dims=["pos", "contig"]).df[contig, region] df
# Summarize frequencies
def calc_af(df):
"""Consolidate AC and compute AN, AF"""
# Allele Count (AC) = sum of all AC at the same locus
# This step consolidates ACs from all ingested batches
= df.groupby(["pos", "allele"], sort=True).sum()
df
# Allele Number (AN) = sum of AC at the same locus
= df.groupby(["pos"], sort=True).ac.sum().rename("an")
an = df.join(an, how="inner")
df
# Allele Frequency (AF) = AC / AN
"af"] = df.ac / df.an
df[return df
calc_af(df)
ac | an | af | ||
---|---|---|---|---|
pos | allele | |||
43036179 | A | 33 | 66 | 0.500000 |
G | 33 | 66 | 0.500000 | |
43036181 | A | 1 | 2 | 0.500000 |
G | 1 | 2 | 0.500000 | |
43036308 | C | 238 | 528 | 0.450758 |
... | ... | ... | ... | ... |
43133528 | C | 5 | 12 | 0.416667 |
T | 6 | 12 | 0.500000 | |
TC | 1 | 12 | 0.083333 | |
43133556 | C | 1 | 2 | 0.500000 |
T | 1 | 2 | 0.500000 |
8292 rows × 3 columns