The goal is to implement an efficient visualization for analyzing the dependencies of R packages.
From the aspect of software engineering, dependencies \(\approx\) risks, i.e., adding more dependencies imports more risks. There are two view angles for looking at dependencies.
First, from the perspective of the whole ecosystem, hub packages such as ggplot2 and Rcpp are more risky packages.
Second, from the perspective of an R package developer, a parent package \(P\) is more risky if it imports more upstream dependencies.
We take the second view angle, as we developers cannot change the whole software ecosystem, but we can decide which parent packages to use.
Now the question is: how can we identify the most risky parents, and how such risky parents affect your package, via a heatmap visualization?
In the analysis, we take the ComplexHeatmap package
as an example. tools::package_dependencies() retrieves
package dependencies from CRAN/Bioconductor repositories.
## Select repositories 1 to 4; entries 2:4 are the three Bioconductor repos.
setRepositories(ind = seq_len(4))
library(tools)
## With no `which` argument, the strong dependencies are returned.
package_dependencies(packages = "ComplexHeatmap")
## $ComplexHeatmap
## [1] "methods" "grid" "graphics" "stats"
## [5] "grDevices" "circlize" "GetoptLong" "colorspace"
## [9] "clue" "RColorBrewer" "GlobalOptions" "png"
## [13] "digest" "IRanges" "matrixStats" "foreach"
## [17] "doParallel" "codetools"
## Query each dependency category of ComplexHeatmap separately.
package_dependencies(packages = "ComplexHeatmap", which = "Depends")
## $ComplexHeatmap
## [1] "methods" "grid" "graphics" "stats" "grDevices"
package_dependencies(packages = "ComplexHeatmap", which = "Imports")
## $ComplexHeatmap
## [1] "circlize" "GetoptLong" "colorspace" "clue"
## [5] "RColorBrewer" "GlobalOptions" "png" "digest"
## [9] "IRanges" "matrixStats" "foreach" "doParallel"
## [13] "codetools"
package_dependencies(packages = "ComplexHeatmap", which = "LinkingTo")
## $ComplexHeatmap
## character(0)
package_dependencies(packages = "ComplexHeatmap", which = "Suggests")
## $ComplexHeatmap
## [1] "testthat" "knitr" "markdown" "dendsort"
## [5] "jpeg" "tiff" "fastcluster" "EnrichedHeatmap"
## [9] "dendextend" "grImport" "grImport2" "glue"
## [13] "GenomicRanges" "gridtext" "pheatmap" "gridGraphics"
## [17] "gplots" "rmarkdown" "Cairo" "magick"
package_dependencies(packages = "ComplexHeatmap", which = "Enhances")
## $ComplexHeatmap
## character(0)
The direct dependencies (i.e., parent dependencies) can also be
obtained by parsing the DESCRIPTION file, using
read.dcf() function.
## DESCRIPTION file of the locally installed package.
read.dcf(system.file(package = "ComplexHeatmap", "DESCRIPTION"))
## DESCRIPTION file of the development version on GitHub. url() creates a
## connection object that read.dcf() does not destroy, so close it explicitly
## to avoid leaking it (and the "closing unused connection" warning).
con = url("https://raw.githubusercontent.com/jokergoo/ComplexHeatmap/refs/heads/master/DESCRIPTION")
read.dcf(con)
close(con)
tools::package_dependencies() also retrieves the
distal/indirect upstream dependencies.
## Strong dependencies of ComplexHeatmap, followed recursively upstream.
package_dependencies(
    packages = "ComplexHeatmap",
    which = "strong",
    recursive = TRUE
)
## $ComplexHeatmap
## [1] "methods" "grid" "graphics" "stats"
## [5] "grDevices" "circlize" "GetoptLong" "colorspace"
## [9] "clue" "RColorBrewer" "GlobalOptions" "png"
## [13] "digest" "IRanges" "matrixStats" "foreach"
## [17] "doParallel" "codetools" "rjson" "crayon"
## [21] "utils" "BiocGenerics" "S4Vectors" "stats4"
## [25] "shape" "cluster" "iterators" "parallel"
However, the previous code retrieves only the recursive strong
dependencies. We need all strong and weak parents of our package \(P\), but for more upstream dependencies, we
only need strong ones. This can be done simply by sending all parents of
ComplexHeatmap to
tools::package_dependencies().
## Inner call: every parent (strong and weak) of ComplexHeatmap.
## Outer call: the recursive strong dependencies of each of those parents.
package_dependencies(
    packages = package_dependencies("ComplexHeatmap", which = "all")[["ComplexHeatmap"]],
    which = "strong",
    recursive = TRUE
)
## $methods
## character(0)
##
## $grid
## character(0)
##
## $graphics
## character(0)
##
## $stats
## character(0)
##
## $grDevices
## character(0)
##
## $circlize
## [1] "graphics" "GlobalOptions" "shape" "grDevices"
## [5] "utils" "stats" "colorspace" "methods"
## [9] "grid"
##
## $GetoptLong
## [1] "rjson" "GlobalOptions" "methods" "crayon"
## [5] "utils" "grDevices"
##
## $colorspace
## [1] "methods" "graphics" "grDevices" "stats"
##
## $clue
## [1] "stats" "cluster" "graphics" "methods" "grDevices" "utils"
##
## $RColorBrewer
## character(0)
##
## $GlobalOptions
## [1] "methods" "utils"
##
## $png
## character(0)
##
## $digest
## [1] "utils"
##
## $IRanges
## [1] "methods" "utils" "stats" "BiocGenerics" "S4Vectors"
## [6] "stats4" "graphics"
##
## $matrixStats
## character(0)
##
## $foreach
## [1] "codetools" "utils" "iterators"
##
## $doParallel
## [1] "foreach" "iterators" "parallel" "utils" "codetools"
##
## $codetools
## character(0)
##
## $testthat
## [1] "brio" "callr" "cli" "desc" "digest" "evaluate"
## [7] "jsonlite" "lifecycle" "magrittr" "methods" "pkgload" "praise"
## [13] "processx" "ps" "R6" "rlang" "utils" "waldo"
## [19] "withr" "glue" "fs" "pkgbuild" "rprojroot" "diffobj"
## [25] "graphics" "grDevices" "crayon" "tools" "stats"
##
## $knitr
## [1] "evaluate" "highr" "methods" "tools" "xfun" "yaml"
## [7] "grDevices" "stats"
##
## $markdown
## [1] "utils" "xfun" "litedown" "commonmark" "grDevices"
## [6] "stats" "tools"
##
## $dendsort
## character(0)
##
## $jpeg
## character(0)
##
## $tiff
## character(0)
##
## $fastcluster
## character(0)
##
## $EnrichedHeatmap
## [1] "methods" "grid" "ComplexHeatmap" "GenomicRanges"
## [5] "matrixStats" "stats" "GetoptLong" "Rcpp"
## [9] "utils" "locfit" "circlize" "IRanges"
## [13] "graphics" "grDevices" "colorspace" "clue"
## [17] "RColorBrewer" "GlobalOptions" "png" "digest"
## [21] "foreach" "doParallel" "codetools" "stats4"
## [25] "BiocGenerics" "S4Vectors" "GenomeInfoDb" "XVector"
## [29] "rjson" "crayon" "shape" "lattice"
## [33] "UCSC.utils" "GenomeInfoDbData" "tools" "zlibbioc"
## [37] "cluster" "iterators" "parallel" "httr"
## [41] "jsonlite" "curl" "mime" "openssl"
## [45] "R6" "askpass" "sys"
##
## $dendextend
## [1] "utils" "stats" "datasets" "magrittr" "ggplot2"
## [6] "viridis" "cli" "glue" "grDevices" "grid"
## [11] "gtable" "isoband" "lifecycle" "MASS" "mgcv"
## [16] "rlang" "scales" "tibble" "vctrs" "withr"
## [21] "viridisLite" "gridExtra" "graphics" "methods" "nlme"
## [26] "Matrix" "splines" "farver" "labeling" "R6"
## [31] "RColorBrewer" "pillar" "pkgconfig" "lattice" "utf8"
##
## $grImport
## [1] "methods" "grDevices" "graphics" "grid" "XML" "utils"
##
## $grImport2
## [1] "methods" "grDevices" "grid" "XML" "png" "jpeg"
## [7] "base64enc" "utils"
##
## $glue
## [1] "methods"
##
## $GenomicRanges
## [1] "methods" "stats4" "BiocGenerics" "S4Vectors"
## [5] "IRanges" "GenomeInfoDb" "utils" "stats"
## [9] "XVector" "graphics" "UCSC.utils" "GenomeInfoDbData"
## [13] "tools" "zlibbioc" "httr" "jsonlite"
## [17] "curl" "mime" "openssl" "R6"
## [21] "askpass" "sys"
##
## $gridtext
## [1] "curl" "grid" "grDevices" "markdown" "rlang"
## [6] "Rcpp" "png" "jpeg" "stringr" "xml2"
## [11] "methods" "utils" "xfun" "litedown" "cli"
## [16] "glue" "lifecycle" "magrittr" "stringi" "vctrs"
## [21] "commonmark" "tools" "stats"
##
## $pheatmap
## [1] "grid" "RColorBrewer" "scales" "gtable" "stats"
## [6] "grDevices" "graphics" "cli" "glue" "lifecycle"
## [11] "rlang" "farver" "labeling" "R6" "viridisLite"
## [16] "utils" "methods"
##
## $gridGraphics
## [1] "grid" "graphics" "grDevices"
##
## $gplots
## [1] "gtools" "stats" "caTools" "KernSmooth" "methods"
## [6] "bitops" "utils"
##
## $rmarkdown
## [1] "bslib" "evaluate" "fontawesome" "htmltools" "jquerylib"
## [6] "jsonlite" "knitr" "methods" "tinytex" "tools"
## [11] "utils" "xfun" "yaml" "base64enc" "cachem"
## [16] "fastmap" "grDevices" "lifecycle" "memoise" "mime"
## [21] "rlang" "sass" "digest" "highr" "stats"
## [26] "cli" "glue" "fs" "R6" "rappdirs"
##
## $Cairo
## [1] "grDevices" "graphics"
##
## $magick
## [1] "Rcpp" "magrittr" "curl" "methods" "utils"
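Now we have the following two types of dependency packages: (1) the parent packages, i.e. all strong and weak direct dependencies of ComplexHeatmap, and (2) the upstream packages, i.e. the strong dependencies, direct or indirect, of each parent.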
The two types of dependency are generated in the following code:
## All direct dependencies (strong and weak) of ComplexHeatmap.
parents = package_dependencies(
    packages = "ComplexHeatmap",
    which = "all"
)[["ComplexHeatmap"]]
## For every parent, its strong dependencies followed recursively upstream.
lt_upstream = package_dependencies(
    packages = parents,
    which = "strong",
    recursive = TRUE
)
As we will use heatmap for visualization, we need to construct a
heatmap from parents and lt_upstream. We can
put one type of dependency packages to the rows (e.g. parents) and the
second type of dependency packages to the columns (e.g. all upstream
dependencies).
We create the following matrix \(M\) where if \(M_{i,j} = 1\), it means the \(j\)-th package is needed directly or indirectly by the \(i\)-th parent, and finally needed by your package \(P\).
## Columns: every distinct upstream package; rows: the parents.
all_upstream = unique(unlist(lt_upstream))
dep_mat = matrix(0, nrow = length(parents), ncol = length(all_upstream))
dimnames(dep_mat) = list(parents, all_upstream)
## Mark each (parent, upstream) pair with 1 in a single assignment: the i-th
## element of lt_upstream supplies the column positions for the i-th row.
dep_mat[cbind(
    rep(seq_along(lt_upstream), times = lengths(lt_upstream)),
    match(unlist(lt_upstream), all_upstream)
)] = 1
As dep_mat is a matrix, we directly use
Heatmap() to visualize it.
library(ComplexHeatmap)
## Draw the dependency matrix with all default settings.
Heatmap(matrix = dep_mat)
The heatmap can already show risky parents and some patterns of how these risky parents affect ComplexHeatmap.
As the values in the matrix are discrete and act more like identity values, we use a mosaic-style visualization.
## Mosaic-style heatmap: one color per discrete value and a white border
## around every cell.
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60
)
It looks much prettier.
There are five dependency categories for all parent packages.
However, the previous method
(i.e. package_dependencies("ComplexHeatmap", which = "all"))
can not capture such information. To have such information, we have to
run package_dependencies() five times and each time we only
specify one specific dependency category.
## One query per dependency category; the list is named by category.
lt_parents = lapply(
    setNames(nm = c("Depends", "Imports", "LinkingTo", "Suggests", "Enhances")),
    function(field) {
        package_dependencies("ComplexHeatmap", which = field)[[1]]
    }
)
## Repeat each category name once per parent in that category, so that
## parent_types lines up element-wise with parents.
parent_types = rep(names(lt_parents), times = lengths(lt_parents))
parents = unlist(lt_parents)
parent_types
## [1] "Depends" "Depends" "Depends" "Depends" "Depends" "Imports"
## [7] "Imports" "Imports" "Imports" "Imports" "Imports" "Imports"
## [13] "Imports" "Imports" "Imports" "Imports" "Imports" "Imports"
## [19] "Suggests" "Suggests" "Suggests" "Suggests" "Suggests" "Suggests"
## [25] "Suggests" "Suggests" "Suggests" "Suggests" "Suggests" "Suggests"
## [31] "Suggests" "Suggests" "Suggests" "Suggests" "Suggests" "Suggests"
## [37] "Suggests" "Suggests"
The order of the elements in parents might have changed, so we
regenerate dep_mat.
## Rebuild the parent-by-upstream matrix for the re-derived `parents`.
dep_mat = matrix(0, nrow = length(parents), ncol = length(all_upstream))
rownames(dep_mat) = parents
colnames(dep_mat) = all_upstream
## lt_upstream was computed for the earlier `parents` and keeps that order, so
## look up each parent's upstream packages by name rather than by position;
## otherwise a reordering of `parents` would fill the wrong rows.
for(i in seq_along(parents)) {
    dep_mat[i, lt_upstream[[parents[[i]]]]] = 1
}
We simply split the heatmap by rows.
## Same mosaic heatmap, with the rows split by dependency category.
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60,
    row_split = parent_types
)
We don’t really need the hierarchical clustering. Instead, we can provide self-defined orders. In the following code, we simply order the rows and the columns by their totals, i.e. by the number of packages in each row and in each column.
## Rows: parents with the fewest upstream packages first.
row_order = order(rowSums(dep_mat))
## Columns: upstream packages needed by the most parents first.
column_order = order(colSums(dep_mat), decreasing = TRUE)
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60,
    row_split = parent_types,
    row_order = row_order,
    column_order = column_order
)
In R, there are CRAN/Bioconductor packages, also base packages. We can additionally split the heatmap by columns into base packages and contributed packages.
## Installed packages whose Priority field is "base"; a missing priority (NA)
## counts as not base.
tb = installed.packages()
l = !is.na(tb[, "Priority"]) & tb[, "Priority"] == "base"
base_packages = tb[l, "Package"]
base_packages
## base compiler datasets grDevices graphics grid
## "base" "compiler" "datasets" "grDevices" "graphics" "grid"
## methods parallel splines stats stats4 tcltk
## "methods" "parallel" "splines" "stats" "stats4" "tcltk"
## tools utils
## "tools" "utils"
## Label every column (upstream package) as a base or a contributed package.
pkg_type = ifelse(
    colnames(dep_mat) %in% base_packages,
    "base",
    "contrib"
)
## Split the columns by that label in addition to the row split.
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60,
    row_split = parent_types,
    column_split = pkg_type,
    row_order = row_order,
    column_order = column_order
)
Note package_dependencies() only includes dependencies
of CRAN/Bioconductor packages, and base packages are not included.
To enhance the visual difference between base packages and contributed
packages, we can assign different colors to them. As the original
matrix dep_mat is composed of 0 and 1, we simply multiply
the values by 2 if the column package is a contributed package.
## Recode contributed-package columns from 0/1 to 0/2 so they get their own
## color; base-package columns keep 0/1.
dep_mat[, pkg_type == "contrib"] = 2 * dep_mat[, pkg_type == "contrib"]
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2, "2" = 4),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60,
    row_split = parent_types,
    column_split = pkg_type,
    row_order = row_order,
    column_order = column_order,
    show_heatmap_legend = FALSE
)
We can already have some impression of which parents import more upstream packages by the number of red and blue grids per row. We can add a row annotation of numbers of upstream dependencies of parents to give a quick and quantitative visualization of such information.
## Number of upstream dependencies of each parent, i.e. the number of non-zero
## cells per row. rowSums() on the logical matrix is the vectorized form of
## apply(dep_mat, 1, function(x) sum(x > 0)).
n_dep = rowSums(dep_mat > 0)
## Append a barplot of n_dep to the right of the heatmap as a row annotation.
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2, "2" = 4),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60,
    row_split = parent_types,
    column_split = pkg_type,
    row_order = row_order,
    column_order = column_order,
    show_heatmap_legend = FALSE
) +
rowAnnotation(
    n_dep = anno_barplot(n_dep, gp = gpar(fill = "grey", col = NA)),
    width = unit(2, "cm")
)
Last, as we add the row annotation as the second element in the heatmap list, row names, i.e. the parent packages, are not shown. There are two ways to add these package names:
## First way: append the parent names as a text annotation after the barplot.
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2, "2" = 4),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60,
    row_split = parent_types,
    column_split = pkg_type,
    row_order = row_order,
    column_order = column_order,
    show_heatmap_legend = FALSE
) +
rowAnnotation(
    n_dep = anno_barplot(n_dep, gp = gpar(fill = "grey", col = NA)),
    width = unit(2, "cm")
) +
rowAnnotation(label = anno_text(parents))
## Second way: pass the barplot as the heatmap's own right annotation instead
## of adding it as a separate element of the heatmap list.
Heatmap(
    dep_mat,
    col = c("0" = "grey", "1" = 2, "2" = 4),
    rect_gp = gpar(col = "white"),
    column_names_rot = 60,
    row_split = parent_types,
    column_split = pkg_type,
    row_order = row_order,
    column_order = column_order,
    show_heatmap_legend = FALSE,
    right_annotation = rowAnnotation(
        n_dep = anno_barplot(n_dep, gp = gpar(fill = "grey", col = NA)),
        width = unit(2, "cm")
    )
)
This is the basic idea of the visualization implemented in the pkgndep package.
library(pkgndep)
## Run the pkgndep analysis on ComplexHeatmap and plot the result.
plot(
    pkgndep("ComplexHeatmap"),
    fix_size = FALSE
)
## retrieve package database from CRAN/Bioconductor (3.19)...
## - 25555 remote packages on CRAN/Bioconductor.
## - 381 packages installed locally.
## prepare dependency table...
## prepare reverse dependency table...