great() supports various sources to obtain gene TSS. The sources can be one of:

In this document, we will compare different TSS sources and their influence on GREAT enrichment analysis.

We use human genome hg38 here because there will be gene ID conversions (e.g. from Ensembl ID to Entrez ID for Gencode annotation), using the newest genome annotation version will reduce the inconsistency between different sources.

The helper function getTSS() extracts TSS from a specific source. Note genes in the four sources are protein-coding genes.

library(rGREAT)

tss_txdb = getTSS("TxDb.Hsapiens.UCSC.hg38.knownGene")
tss_gencode = getTSS("gencode_v40")
tss_refseq = getTSS("refseq:hg38")
tss_great = getTSS("great:hg38")

Gene IDs in tss_gencode are Ensembl gene IDs, and gene IDs in tss_great are gene symbols. We convert them to Entrez gene IDs.

library(org.Hs.eg.db)

map = unlist(as.list(org.Hs.egENSEMBL2EG))
new_gene_id = map[tss_gencode$gene_id]
tss_gencode$gene_id[!is.na(new_gene_id)] = new_gene_id[!is.na(new_gene_id)]

map = unlist(as.list(org.Hs.egSYMBOL2EG))
new_gene_id = map[tss_great$gene_id]
tss_great$gene_id[!is.na(new_gene_id)] = new_gene_id[!is.na(new_gene_id)]

We put all TSS objects into a single list:

tss_lt = list(
    txdb_known_gene = tss_txdb,
    gencode = tss_gencode,
    refseq = tss_refseq,
    great = tss_great
)
tss_lt = lapply(tss_lt, sort)

We first look at the overlap of genes. It basically shows all five sources almost contain the same set of genes.

library(ComplexHeatmap)
lt = lapply(tss_lt, function(x) {
    unique(x$gene_id)
})
cm = make_comb_mat(lt)
UpSet(cm, column_title = "Number of genes")

Next we look at the overlap of TSS with their exact positions. There are quite a large disagreement between different TSS sources.

lt = lapply(tss_lt, function(x) {
    unique(paste0(strand(x), seqnames(x), ":", start(x)))
})
cm = make_comb_mat(lt)
UpSet(cm, column_title = "Number of TSS (with their exact positions)")

Next we compare difference of TSS locations in different sources. We first take the common genes in all four sources.

tss_lt2 = lapply(tss_lt, function(x) {
    tb = table(x$gene_id)
    dp = names(tb[which(tb == 1)])
    x = x[x$gene_id %in% dp]
    names(x) = x$gene_id
    x
})
cn = tss_lt2[[1]]$gene_id
for(i in 2:length(tss_lt2)) {
    cn = intersect(cn, tss_lt2[[i]]$gene_id)
}
length(cn)
## [1] 17113
tss_lt2 = lapply(tss_lt2, function(x) x[cn])

Next we perform pairwise comparisons for every pair of TSS sources.

library(GetoptLong)
compare_tss_pos = function(tss1, tss2, name1, name2, ...) {
    d1 = start(tss1)
    d2 = start(tss2)

    diff = abs(d1 - d2)
    
    v = numeric()
    v["0"] = sum(diff == 0)
    v["1-5"] = sum(diff >= 1 & diff <= 5)
    v["6-10"] = sum(diff >= 6 & diff <= 10)
    v["11-50"] = sum(diff >= 11 & diff <= 50)
    v["51-500"] = sum(diff >= 51 & diff <= 500)
    v["501-5kb"] = sum(diff >= 501 & diff <= 5000)
    v["5kb-50kb"] = sum(diff >= 5001 & diff <= 50000)
    v[">50kb"] = sum(diff >= 50001)

    barplot(v, ylab = "Number of TSS", 
        main = qq("TSS dist_diff, @{name1} and @{name2}\nmean (trim 0.05) = @{round(mean(diff, trim = 0.05))}bp, median = @{median(diff)}bp"), 
        las = 3, ...)
}

par(mfrow = c(4, 4))
for(i in 1:4) {
    for(j in 1:4) {
        if(i == j) {
            plot(c(0, 1), c(0, 1), type = "n", axes = FALSE, ann = FALSE)
            text(0.5, 0.5, names(tss_lt2)[i], cex = 1.5)
        } else {
            compare_tss_pos(tss_lt2[[j]], tss_lt2[[i]], names(tss_lt2)[j], names(tss_lt2)[i], ylim = c(0, 16000))
        }
    }
}