Processing math: 100%

Examples of ORA analysis

lt = readRDS(system.file("extdata", "ora.rds", package = "GSEAtraining"))
bg_gene = lt$bg_gene
diff_gene = lt$diff_gene
gene_set = lt$gene_set

The lengths of the gene lists:

length(bg_gene)  # background genes
## [1] 38592
length(diff_gene)
## [1] 968
length(gene_set)
## [1] 200
length(intersect(diff_gene, gene_set))  # genes in the gene set
## [1] 14

A safer way is to make sure diff_gene and gene_set are subset of bg_genes

diff_gene = intersect(diff_gene, bg_gene)
gene_set = intersect(gene_set, bg_gene)

We fill into the 2x2 contigency table:

in the set not in the set total
DE n11 n12 n1+
not DE n21 n22 n2+
total n+1 n+2 n

Then we fill all values into the 2x2 table:

in the set not in the set total
DE 14 954 968
not DE 186 37438 37624
total 200 38392 38592

By hypergeometric distribution

All genes are separated into two distinct sets: in the gene set and not in the gene set:

1 - phyper(14-1, 200, 38392, 968)
## [1] 0.0005686084
# or
phyper(14-1, 200, 38392, 968, lower.tail = FALSE)
## [1] 0.0005686084

The separation can be performed in the other dimension: diff genes and non-diff genes:

1 - phyper(14-1, 968, 37624, 200)
## [1] 0.0005686084

By Fisher’s Exact test

fisher.test(matrix(c(14, 186, 954, 37438), nrow = 2))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(14, 186, 954, 37438), nrow = 2)
## p-value = 0.0005686
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.577790 5.106564
## sample estimates:
## odds ratio 
##   2.953612

By Binomial distribution

p = probability of a gene being diff:

1 - pbinom(14-1, 200, 968/38592)
## [1] 0.0005919725

Or p = probability of a gene being in the gene set:

1 - pbinom(14-1, 968, 200/38592)
## [1] 0.0006924557

By Chi-square test

chisq.test(matrix(c(14, 186, 954, 37438), nrow = 2), correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  matrix(c(14, 186, 954, 37438), nrow = 2)
## X-squared = 16.587, df = 1, p-value = 4.647e-05

By two-sample z-test

p1 = 14/200
p2 = 954/38392
p = 968/38592

z = abs(p1 - p2)/sqrt(p*(1-p))/sqrt(1/200 + 1/38392)
2*pnorm(z, lower.tail = FALSE)
## [1] 4.647219e-05
z^2
## [1] 16.58685

We can also do in the other dimension, the tests are identical:

p1 = 14/968
p2 = 186/37624
p = 200/38592

z = abs(p1 - p2)/sqrt(p*(1-p))/sqrt(1/968 + 1/37624)
2*pnorm(z, lower.tail = FALSE)
## [1] 4.647219e-05
z^2
## [1] 16.58685

Test the speed of various tests

library(microbenchmark)

microbenchmark(
    hyper = 1 - phyper(13, 200, 38392, 968),
    fisher = fisher.test(matrix(c(14, 186, 954, 37438), nrow = 2)),
    binom = 1 - pbinom(13, 968, 200/38592),
    chisq = chisq.test(matrix(c(14, 186, 954, 37438), nrow = 2), correct = FALSE),
    ztest = {
        p1 = 14/200
        p2 = 954/38392
        p = 968/38592

        z = abs(p1 - p2)/sqrt(p*(1-p))/sqrt(1/200 + 1/38392)
        2*pnorm(z, lower.tail = FALSE)
    },
    times = 1000
)
## Unit: microseconds
##    expr     min       lq        mean    median        uq       max neval
##   hyper   1.383   2.6435    5.426053    3.8825    5.2265    98.790  1000
##  fisher 648.304 902.5435 1395.452271 1068.1695 1476.6280 15190.689  1000
##   binom   1.217   2.2630    6.272040    4.1085    6.3560   175.295  1000
##   chisq  76.833 115.6860  186.023974  141.9140  201.1945  1089.835  1000
##   ztest   2.637   4.9400   11.728395    7.8975   12.6965   353.362  1000

The effect of background size

1 - phyper(13, 200, 38392, 968)
## [1] 0.0005686084
1 - phyper(13, 200, 38392 - 10000, 968)
## [1] 0.008444096
1 - phyper(13, 200, 38392 - 20000, 968)
## [1] 0.1606062
1 - phyper(13, 200, 38392 + 10000, 968)
## [1] 5.473547e-05
1 - phyper(13, 200, 38392 + 20000, 968)
## [1] 7.140953e-06

Sensitivity of the small number of diff genes in the set

1 - phyper(13, 200, 38392, 968)
## [1] 0.0005686084
1 - phyper(13-3, 200, 38392, 968)
## [1] 0.01263274
1 - phyper(13+3, 200, 38392, 968)
## [1] 1.35041e-05