import tiledbvcf
import tiledbTileDB-VCF Allele Frequencies
This notebook will walk you through
- Selecting allele frequencies
- Filtering on allele frequencies
- Direct allele frequency access
This notebook uses the TileDB-Inc/vcf-1kghicov-dragen-v376 version of the 1000 genome project.
uri = "tiledb://TileDB-Inc/vcf-1kghicov-dragen-v376"Including Allele Frequency Field
The computed allele frequency can be included in results with the info_TILEDB_IAF attribute
attributes = ["sample_name", "contig", "pos_start", "pos_end", "alleles", "fmt_GT", "info_TILEDB_IAF"]
ds = tiledbvcf.Dataset(uri, mode="r")# Query for BTD across all 3404 samples
df = ds.read(
    attrs=attributes,
    regions=["chr3:15601341-15722311"],
)df| sample_name | contig | pos_start | pos_end | alleles | fmt_GT | info_TILEDB_IAF | |
|---|---|---|---|---|---|---|---|
| 0 | HG00096 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None | 
| 1 | HG00097 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None | 
| 2 | HG00099 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None | 
| 3 | HG00100 | chr3 | 15601536 | 15601536 | [A, G] | [1, 1] | None | 
| 4 | HG00101 | chr3 | 15601536 | 15601536 | [A, G] | [0, 1] | None | 
| ... | ... | ... | ... | ... | ... | ... | ... | 
| 606391 | NA21143 | chr3 | 15721413 | 15721413 | [C, T] | [0, 1] | None | 
| 606392 | NA21144 | chr3 | 15721413 | 15721413 | [C, T] | [1, 1] | None | 
| 606393 | NA21144 | chr3 | 15721470 | 15721470 | [T, C] | [0, 1] | None | 
| 606394 | NA21143 | chr3 | 15721933 | 15721933 | [C, T] | [0, 1] | None | 
| 606395 | NA21144 | chr3 | 15722066 | 15722066 | [A, G] | [0, 1] | None | 
606396 rows × 7 columns
Filtering on IAF
Filters for allele frequency can also be included by using the set_af_filter parameter.
# Query for BTD across all 3404 samples
df = ds.read(
    attrs=attributes,
    regions=["chr3:15601341-15722311"],
    set_af_filter="<0.5",
)df| sample_name | contig | pos_start | pos_end | alleles | fmt_GT | info_TILEDB_IAF | |
|---|---|---|---|---|---|---|---|
| 0 | HG00101 | chr3 | 15601536 | 15601536 | [A, G] | [0, 1] | [0.027682202, 0.9723178] | 
| 1 | HG00100 | chr3 | 15601668 | 15601668 | [G, A] | [0, 1] | [0.43612567, 0.56387436] | 
| 2 | HG00100 | chr3 | 15602568 | 15602568 | [A, G] | [0, 1] | [0.42662117, 0.57337886] | 
| 3 | HG00100 | chr3 | 15602688 | 15602688 | [G, A] | [0, 1] | [0.47108433, 0.52891564] | 
| 4 | HG00096 | chr3 | 15603161 | 15603161 | [A, G] | [0, 1] | [0.44444445, 0.5555556] | 
| ... | ... | ... | ... | ... | ... | ... | ... | 
| 370583 | NA21144 | chr3 | 15721189 | 15721189 | [C, G] | [0, 1] | [0.43555242, 0.5644476] | 
| 370584 | NA21143 | chr3 | 15721340 | 15721340 | [G, A] | [0, 1] | [0.3335806, 0.6664194] | 
| 370585 | NA21143 | chr3 | 15721413 | 15721413 | [C, T] | [0, 1] | [0.33333334, 0.6666667] | 
| 370586 | NA21144 | chr3 | 15721470 | 15721470 | [T, C] | [0, 1] | [0.4868421, 0.5131579] | 
| 370587 | NA21144 | chr3 | 15722066 | 15722066 | [A, G] | [0, 1] | [0.4359155, 0.56408453] | 
370588 rows × 7 columns
Accessing Allele Frequencies Directly
Allele frequencies can also be queried directly in addition to as part of the variant query
# Get the variant stats ur
with tiledb.Group(uri) as g:
    alleles_uri  = g["variant_stats"].uri# BRCA1 from https://www.ncbi.nlm.nih.gov/gene/672
contig = 'chr17'
region = slice(43_036_174, 43_133_600)  # pos is 0 based# Query allele frequencies and get results as a pandas dataframe
with tiledb.open(alleles_uri) as A:
    df = A.query(attrs=["ac", "allele"], dims=["pos", "contig"]).df[contig, region]# Summarize frequencies
def calc_af(df):
    """Consolidate AC and compute AN, AF"""
    # Allele Count (AC) = sum of all AC at the same locus
    # This step consolidates ACs from all ingested batches
    df = df.groupby(["pos", "allele"], sort=True).sum()
    # Allele Number (AN) = sum of AC at the same locus
    an = df.groupby(["pos"], sort=True).ac.sum().rename("an") 
    df = df.join(an, how="inner")
    
    # Allele Frequency (AF) = AC / AN
    df["af"] = df.ac / df.an
    return df
calc_af(df)| ac | an | af | ||
|---|---|---|---|---|
| pos | allele | |||
| 43036179 | A | 33 | 66 | 0.500000 | 
| G | 33 | 66 | 0.500000 | |
| 43036181 | A | 1 | 2 | 0.500000 | 
| G | 1 | 2 | 0.500000 | |
| 43036308 | C | 238 | 528 | 0.450758 | 
| ... | ... | ... | ... | ... | 
| 43133528 | C | 5 | 12 | 0.416667 | 
| T | 6 | 12 | 0.500000 | |
| TC | 1 | 12 | 0.083333 | |
| 43133556 | C | 1 | 2 | 0.500000 | 
| T | 1 | 2 | 0.500000 | 
8292 rows × 3 columns