Topic 2-02: Implement ORA from stratch

Implementing ORA is very simple. As long as genes and gene sets are already in the correct ID types and formats, the implementation of ORA only needs several lines.

Three inputs for ORA:

a vector of (DE) genes: genes
a list of vectors where each vector contains genes in a gene set: gene_sets
a vector of background genes: universe

We calculate the following numbers:

Number of total genes:

n_universe = length(universe)

Number of DE genes:

n_genes = length(genes)

Sizes of gene sets:

m = sapply(gene_sets, length)

Number of DE genes in gene sets:

x = sapply(gene_sets, function(x) length(intersect(x, genes)))

We rename n_genes to k:

k = n_genes

We put these variables into the 2x2 contigency table:

	in the set	not in the set	total
DE	`x`	-	`k`
not DE	-	-	-
total	`m`	-	`n_universe`

Then we can calculate p-values from the hypergeometric distribution as:

phyper(x - 1, m, n_universe - m, k, lower.tail = FALSE)

which is the same as:

phyper(x - 1, k, n_universe - k, m, lower.tail = FALSE)

Let’s put all these code into a function:

ora = function(genes, gene_sets, universe) {
    n_universe = length(universe)
    n_genes = length(genes)
    
    x = sapply(gene_sets, function(x) length(intersect(x, genes)))
    m = sapply(gene_sets, length)
    n = n_universe - m
    k = n_genes
    
    p = phyper(x - 1, m, n, k, lower.tail = FALSE)

    data.frame(
        gene_set = names(gene_sets),
        hits = x,
        n_genes = k,
        n_gs = m,
        n_total = n_universe,
        p_value = p
    )
}

We can further improve the function by:

set the default of universe to the total genes in gene_sets
intersect genes and gene_sets to universe. Note this step also removes duplicated genes
add adjusted log2 fold enrichment and p-values
order the result table by the significance

ora = function(genes, gene_sets, universe = NULL) {

    if(is.null(universe)) {
        universe = unique(unlist(gene_sets))
    } else {
        universe = unique(universe)
    }
    # restrict in universe
    genes = intersect(genes, universe)
    gene_sets = lapply(gene_sets, function(x) intersect(x, universe))

    n_universe = length(universe)
    n_genes = length(genes)
    
    x = sapply(gene_sets, function(x) length(intersect(x, genes)))
    m = sapply(gene_sets, length)
    n = n_universe - m
    k = n_genes
    
    p = phyper(x - 1, m, n, k, lower.tail = FALSE)

    df = data.frame(
        gene_set = names(gene_sets),
        hits = x,
        n_genes = k,
        n_gs = m,
        n_total = n_universe,
        log2fe = log2(x*n_universe/k/m),
        p_value = p
    )

    df$p_adjust = p.adjust(df$p_value, "BH")
    rownames(df) = df$gene_set
    df[order(df$p_adjust, df$p_value), ,drop = FALSE]
}

Let’s try our ora() with 1000 random genes on the MSigDB hallmark gene sets. get_msigdb() is from the GSEAtraining package.

library(org.Hs.eg.db)

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
##     as.data.frame, basename, cbind, colnames, dirname, do.call,
##     duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
##     lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which.max, which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## Warning: package 'S4Vectors' was built under R version 4.3.2

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     I, expand.grid, unname

##

library(GSEAtraining)

## 
## Attaching package: 'GSEAtraining'

## The following object is masked _by_ '.GlobalEnv':
## 
##     ora

gs = get_msigdb(version = "2023.2.Hs", collection = "h.all")
genes = random_genes(org.Hs.eg.db, 1000, "ENTREZID")

## 'select()' returned 1:many mapping between keys and columns

df = ora(genes, gs)
head(df)

##                                                                              gene_set
## HALLMARK_G2M_CHECKPOINT                                       HALLMARK_G2M_CHECKPOINT
## HALLMARK_WNT_BETA_CATENIN_SIGNALING               HALLMARK_WNT_BETA_CATENIN_SIGNALING
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION
## HALLMARK_NOTCH_SIGNALING                                     HALLMARK_NOTCH_SIGNALING
## HALLMARK_SPERMATOGENESIS                                     HALLMARK_SPERMATOGENESIS
## HALLMARK_PI3K_AKT_MTOR_SIGNALING                     HALLMARK_PI3K_AKT_MTOR_SIGNALING
##                                            hits n_genes n_gs n_total    log2fe
## HALLMARK_G2M_CHECKPOINT                      14     218  200    4384 0.4933465
## HALLMARK_WNT_BETA_CATENIN_SIGNALING           4     218   42    4384 0.9375303
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION   13     218  200    4384 0.3864313
## HALLMARK_NOTCH_SIGNALING                      3     218   32    4384 0.9148103
## HALLMARK_SPERMATOGENESIS                      9     218  135    4384 0.4229572
## HALLMARK_PI3K_AKT_MTOR_SIGNALING              7     218  105    4384 0.4229572
##                                              p_value  p_adjust
## HALLMARK_G2M_CHECKPOINT                    0.1207826 0.9983645
## HALLMARK_WNT_BETA_CATENIN_SIGNALING        0.1538410 0.9983645
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 0.1936104 0.9983645
## HALLMARK_NOTCH_SIGNALING                   0.2111459 0.9983645
## HALLMARK_SPERMATOGENESIS                   0.2275209 0.9983645
## HALLMARK_PI3K_AKT_MTOR_SIGNALING           0.2661692 0.9983645

The only restriction of ora() is the ID type for genes should be the same as in gene_sets.

Practice

Practice 1

The file de.rds contains results from a differential expression (DE) analysis.

de = readRDS(system.file("extdata", "de.rds", package = "GSEAtraining"))
de = de[, c("symbol", "p_value")]
de = de[!is.na(de$p_value), ]
head(de)

##     symbol      p_value
## 1   TSPAN6 0.0081373337
## 2     TNMD 0.0085126916
## 3     DPM1 0.9874656931
## 4    SCYL3 0.2401551938
## 5 C1orf112 0.8884603214
## 6      FGR 0.0006134791

Take the symbol and p_value columns, set cutoff of p_value to 0.05, 0.01 and 0.001 respectively to filter the significant DE genes (of course, in applications, you need to correct p-values), apply ora() to the three DE gene lists using the GO BP gene sets (think how to obtain the GO gene sets from org.Hs.eg.db) and compare the ORA results (the whatever comparison you can think of).

If you EntreZ ID in the gene sets, you need to convert gene symbols for the DE genes to EntreZ IDs. But you can also choose to use gene symbols in the gene sets.

Solution

The three DE gene lists:

de_genes_1 = de$symbol[de$p_value < 0.05]
de_genes_2 = de$symbol[de$p_value < 0.01]
de_genes_3 = de$symbol[de$p_value < 0.001]

length(de_genes_1)

## [1] 3156

length(de_genes_2)

## [1] 1708

length(de_genes_3)

## [1] 817

Get the GO BP gene sets. Note genes in de_genes_* are symbols, thus genes in gene sets should also be symbols.

gs = get_GO_gene_sets_from_orgdb(org.Hs.eg.db, "BP", gene_id_type = "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

gs[1]

## $`GO:0000002`
##  [1] "PARP1"    "SLC25A4"  "DNA2"     "TYMP"     "ENDOG"    "LIG3"    
##  [7] "MEF2A"    "MPV17"    "OPA1"     "POLG"     "RRM1"     "SSBP1"   
## [13] "TOP3A"    "TP53"     "DNAJA3"   "LONP1"    "AKT3"     "PPARGC1A"
## [19] "POLG2"    "RRM2B"    "SLC25A36" "TWNK"     "METTL4"   "PIF1"    
## [25] "SESN2"    "SLC25A33" "MGME1"    "FLCN"     "PRIMPOL"  "STOX1"

We perform ORA for the three DE gene lists.

tb1 = ora(de_genes_1, gs)
tb2 = ora(de_genes_2, gs)
tb3 = ora(de_genes_3, gs)

Compare number of significant GO terms:

sum(tb1$p_adjust < 0.05)

## [1] 1025

sum(tb2$p_adjust < 0.05)

## [1] 871

sum(tb3$p_adjust < 0.05)

## [1] 788

Actually we can see gene set enrichment analysis is a very robust method which is not sensitive to DE cutoffs (at least for this dataset).

Make a Venn (Euler) diagram:

library(eulerr)
plot(euler(list("DE_cutoff_0.05"  = tb1$gene_set[tb1$p_adjust < 0.05],
                "DE_cutoff_0.01"  = tb2$gene_set[tb2$p_adjust < 0.05],
                "DE_cutoff_0.001" = tb3$gene_set[tb3$p_adjust < 0.05])), 
     quantities = TRUE)

Compare log2 fold enrichment from the three ORA results. First take the common GO terms in the three ORA tables.

cn = intersect(intersect(rownames(tb1), rownames(tb2)), rownames(tb3))

Then simply make pairwise scatter plots:

par(mfrow = c(1, 3))
plot(tb1[cn, "log2fe"], tb2[cn, "log2fe"], pch = 16, cex = 0.5, col = "#00000080",
    xlim = c(-5, 5), ylim = c(-5, 5), main = "compare log2 fold enrichment from ORA",
    xlab = "DE_cutoff_0.05", ylab = "DE_cutoff_0.01")
abline(a = 0, b = 1, lty = 2, col = "red")

plot(tb1[cn, "log2fe"], tb3[cn, "log2fe"], pch = 16, cex = 0.5, col = "#00000080",
    xlim = c(-5, 5), ylim = c(-5, 5), main = "compare log2 fold enrichment from ORA",
    xlab = "DE_cutoff_0.05", ylab = "DE_cutoff_0.001")
abline(a = 0, b = 1, lty = 2, col = "red")

plot(tb2[cn, "log2fe"], tb3[cn, "log2fe"], pch = 16, cex = 0.5, col = "#00000080",
    xlim = c(-5, 5), ylim = c(-5, 5), main = "compare log2 fold enrichment from ORA",
    xlab = "DE_cutoff_0.01", ylab = "DE_cutoff_0.001")
abline(a = 0, b = 1, lty = 2, col = "red")

Zuguang Gu z.gu@dkfz.de

2024-01-26

Practice

Practice 1

Solution