Skip to contents

Examples of ORA analysis

lt = readRDS(system.file("extdata", "ora.rds", package = "GSEAtopics"))
bg_gene = lt$bg_gene
diff_gene = lt$diff_gene
gene_set = lt$gene_set

The lengths of the gene lists:

length(bg_gene)  # background genes
## [1] 38592
length(diff_gene)
## [1] 968
length(gene_set)
## [1] 200
length(intersect(diff_gene, gene_set))  # genes in the gene set
## [1] 14

A safer way is to make sure diff_gene and gene_set are subset of bg_genes

diff_gene = intersect(diff_gene, bg_gene)
gene_set = intersect(gene_set, bg_gene)

We fill into the 2x2 contigency table:

in the set not in the set total
DE n11n_{11} n12n_{12} n1+n_{1+}
not DE n21n_{21} n22n_{22} n2+n_{2+}
total n+1n_{+1} n+2n_{+2} nn

Then we fill all values into the 2x2 table:

in the set not in the set total
DE 14 954 968
not DE 186 37438 37624
total 200 38392 38592

By hypergeometric distribution

All genes are separated into two distinct sets: in the gene set and not in the gene set:

1 - phyper(14-1, 200, 38392, 968)
## [1] 0.0005686084
# or
phyper(14-1, 200, 38392, 968, lower.tail = FALSE)
## [1] 0.0005686084

The separation can be performed in the other dimension: diff genes and non-diff genes:

1 - phyper(14-1, 968, 37624, 200)
## [1] 0.0005686084

By Fisher’s Exact test

fisher.test(matrix(c(14, 186, 954, 37438), nrow = 2))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(14, 186, 954, 37438), nrow = 2)
## p-value = 0.0005686
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.577790 5.106564
## sample estimates:
## odds ratio 
##   2.953612

By Binomial distribution

p = probability of a gene being diff:

1 - pbinom(14-1, 200, 968/38592)
## [1] 0.0005919725

Or p = probability of a gene being in the gene set:

1 - pbinom(14-1, 968, 200/38592)
## [1] 0.0006924557

By Chi-square test

chisq.test(matrix(c(14, 186, 954, 37438), nrow = 2), correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  matrix(c(14, 186, 954, 37438), nrow = 2)
## X-squared = 16.587, df = 1, p-value = 4.647e-05

By two-sample z-test

p1 = 14/200
p2 = 954/38392
p = 968/38592

z = abs(p1 - p2)/sqrt(p*(1-p))/sqrt(1/200 + 1/38392)
2*pnorm(z, lower.tail = FALSE)
## [1] 4.647219e-05
z^2
## [1] 16.58685

We can also do in the other dimension, the tests are identical:

p1 = 14/968
p2 = 186/37624
p = 200/38592

z = abs(p1 - p2)/sqrt(p*(1-p))/sqrt(1/968 + 1/37624)
2*pnorm(z, lower.tail = FALSE)
## [1] 4.647219e-05
z^2
## [1] 16.58685

Test the speed of various tests

library(microbenchmark)

microbenchmark(
    hyper = 1 - phyper(13, 200, 38392, 968),
    fisher = fisher.test(matrix(c(14, 186, 954, 37438), nrow = 2)),
    binom = 1 - pbinom(13, 968, 200/38592),
    chisq = chisq.test(matrix(c(14, 186, 954, 37438), nrow = 2), correct = FALSE),
    ztest = {
        p1 = 14/200
        p2 = 954/38392
        p = 968/38592

        z = abs(p1 - p2)/sqrt(p*(1-p))/sqrt(1/200 + 1/38392)
        2*pnorm(z, lower.tail = FALSE)
    },
    times = 1000
)
## Unit: nanoseconds
##    expr    min       lq       mean median     uq     max neval
##   hyper    410    574.0    628.120    615    697    3362  1000
##  fisher 195160 213302.5 240874.221 221113 230543 3158804  1000
##   binom    369    492.0    593.229    615    697    2091  1000
##   chisq  20910  22714.0  24394.959  24190  25584   45264  1000
##   ztest    779   1025.0   1316.879   1394   1517    4551  1000

The effect of background size

1 - phyper(13, 200, 38392, 968)
## [1] 0.0005686084
1 - phyper(13, 200, 38392 - 10000, 968)
## [1] 0.008444096
1 - phyper(13, 200, 38392 - 20000, 968)
## [1] 0.1606062
1 - phyper(13, 200, 38392 + 10000, 968)
## [1] 5.473547e-05
1 - phyper(13, 200, 38392 + 20000, 968)
## [1] 7.140953e-06

Sensitivity of the small number of diff genes in the set

1 - phyper(13, 200, 38392, 968)
## [1] 0.0005686084
1 - phyper(13-3, 200, 38392, 968)
## [1] 0.01263274
1 - phyper(13+3, 200, 38392, 968)
## [1] 1.35041e-05