R session 中软件包依赖关系网络

顾祖光 · November 2023 · 版权信息

在R markdown文档中将sessionInfo()放在末尾几乎已经成为一种标准。sessionInfo()能够打印出加载到R session中的包的列表。但这些包之间的依赖关系又如何呢?在这篇博文中,让我们来看看。

让我们打开一个新的R session并仅加载ggplot2包:

library(ggplot2)

接下来我们运行sessionInfo()

x1 = sessionInfo()

打印x1

x1
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.2.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] C/UTF-8/C/C/C/C
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.4.4
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.3       R6_2.5.1         tidyselect_1.2.0 magrittr_2.0.3   gtable_0.3.4    
##  [6] glue_1.6.2       tibble_3.2.1     pkgconfig_2.0.3  generics_0.1.3   dplyr_1.1.3     
## [11] lifecycle_1.0.3  cli_3.6.1        fansi_1.0.5      scales_1.2.1     grid_4.3.1      
## [16] vctrs_0.6.4      withr_2.5.1      compiler_4.3.1   munsell_0.5.0    pillar_1.9.0    
## [21] colorspace_2.1-0 rlang_1.1.1

只有二十几个包被载入,看起来不是很多。返回变量x1是一个列表,其中包含了直接或间接加载的包列表。这些包可以分为三组:

# Split the packages of the session into three groups: base packages,
# other attached packages, and packages that are loaded but not attached.
# vapply() is used instead of sapply() so that each result is always a
# character vector, even when a group is empty (sapply() would return an
# empty list in that case).
base_pkgs = x1$basePkgs
other_pkgs = vapply(x1$otherPkgs, function(x) x$Package, character(1))
loaded_pkgs = vapply(x1$loadedOnly, function(x) x$Package, character(1))

对于session中的包,现在我们需要知道它们的依赖关系。这里我们使用pkgndep包。所有本地安装的包都作为“包数据库”用来查询依赖关系。

library(pkgndep)
db = reformat_db(installed.packages())
## prepare dependency table...
## prepare reverse dependency table...

loaded_pkgs中的包均由other_pkgs中的包以直接或者间接的方式导入R session,所以我们遍历other_pkgs中的每个包,获取其直接和间接的上游依赖关系。这里我们只使用“强”依赖,即依赖关系为“Depends”、“Imports”和“LinkingTo”的依赖包。

# Collect the recursive "strong" dependencies of every attached package.
# The per-package tables are gathered with lapply() and bound once, instead
# of growing `mat` with rbind() on every iteration. unname() prevents the
# package names from being passed to rbind() as argument names.
dep_tables = lapply(unname(other_pkgs), function(p) {
    db$package_dependencies(p, recursive = TRUE, which = "strong")
})

# The empty three-column seed keeps `mat` a matrix even when `other_pkgs`
# is empty.
mat = do.call(rbind, c(list(matrix(nrow = 0, ncol = 3)), dep_tables))

# The same edge can be reached from several attached packages.
mat = unique(mat)

基础包直接随着R发布,我们不关心基于它们的依赖关系,因此我们删除与基础包的依赖关系。我们只将包限制为“其他包”和“加载包”。

# Keep only edges whose two endpoints were both attached or loaded in the
# session.
all_pkgs = c(other_pkgs, loaded_pkgs)
in_session = mat[, 1] %in% all_pkgs & mat[, 2] %in% all_pkgs

# Drop every edge that touches a base package. The negation has to wrap the
# whole disjunction: `!` binds tighter than `|`, so the unparenthesised form
# `!a %in% B | b %in% B` keeps every edge that points to a base package.
touches_base = mat[, 1] %in% pkgndep:::BASE_PKGS | mat[, 2] %in% pkgndep:::BASE_PKGS

mat = mat[in_session & !touches_base, , drop = FALSE]
# NOTE(review): printed output captured with the earlier, unparenthesised
# filter may still list edges into packages from pkgndep:::BASE_PKGS;
# re-run to refresh it.
head(mat)
##      package   dependency  dep_fields
## [1,] "ggplot2" "cli"       "Imports" 
## [2,] "ggplot2" "glue"      "Imports" 
## [3,] "ggplot2" "grid"      "Imports" 
## [4,] "ggplot2" "gtable"    "Imports" 
## [5,] "ggplot2" "lifecycle" "Imports" 
## [6,] "ggplot2" "rlang"     "Imports"

接下来我们使用DiagrammeR包来可视化依赖关系。我们首先生成DOT代码:

# Every package that appears in an edge or in the session becomes a node.
all_nodes = unique(c(mat[, 1], mat[, 2], other_pkgs, loaded_pkgs))

# Node colour: red = attached, blue = loaded only, black = fallback.
node_col = rep("black", length(all_nodes))
node_col[all_nodes %in% other_pkgs] = "red"
node_col[all_nodes %in% loaded_pkgs] = "blue"

library(glue)
# glue() is vectorised over the interpolated values, so this yields one DOT
# statement per node. (glue() has no `collapse` parameter; a named argument
# would only create an unused template variable, so none is passed.)
nodes = glue("  \"{all_nodes}\" [color=\"{node_col}\"];")

# One edge colour per dependency field. The integers index the current
# palette() and are converted to hex strings for DOT.
dep_col = c(2, 4, 3, 5, 6)
dep_col = rgb(t(col2rgb(dep_col)), maxColorValue = 255)
names(dep_col) = c("Depends", "Imports", "LinkingTo", "Suggests", "Enhances")

# The third column of `mat` (dep_fields) selects the edge colour by name.
edges = glue("  \"{mat[, 1]}\" -> \"{mat[, 2]}\" [color=\"{dep_col[mat[, 3]]}\"];")

# Assemble the complete DOT document, one statement per line.
dot = paste(
    c("digraph {",
      "  nodesep=0.05",
      "  rankdir=LR;", 
      "  graph [overlap = true];",
      "  node[shape = box];",
      nodes,
      edges,
      "}"),
    collapse = "\n"
)
cat(dot)
## digraph {
##   nodesep=0.05
##   rankdir=LR;
##   graph [overlap = true];
##   node[shape = box];
##   "ggplot2" [color="red"];
##   "gtable" [color="blue"];
##   "lifecycle" [color="blue"];
##   "scales" [color="blue"];
##   "tibble" [color="blue"];
##   "vctrs" [color="blue"];
##   "munsell" [color="blue"];
##   "pillar" [color="blue"];
##   "cli" [color="blue"];
##   "glue" [color="blue"];
##   "grid" [color="blue"];
##   "rlang" [color="blue"];
##   "withr" [color="blue"];
##   "R6" [color="blue"];
##   "fansi" [color="blue"];
##   "magrittr" [color="blue"];
##   "pkgconfig" [color="blue"];
##   "colorspace" [color="blue"];
##   "utf8" [color="blue"];
##   "tidyselect" [color="blue"];
##   "generics" [color="blue"];
##   "dplyr" [color="blue"];
##   "compiler" [color="blue"];
##   "ggplot2" -> "cli" [color="#2297E6"];
##   "ggplot2" -> "glue" [color="#2297E6"];
##   "ggplot2" -> "grid" [color="#2297E6"];
##   "ggplot2" -> "gtable" [color="#2297E6"];
##   "ggplot2" -> "lifecycle" [color="#2297E6"];
##   "ggplot2" -> "rlang" [color="#2297E6"];
##   "ggplot2" -> "scales" [color="#2297E6"];
##   "ggplot2" -> "tibble" [color="#2297E6"];
##   "ggplot2" -> "vctrs" [color="#2297E6"];
##   "ggplot2" -> "withr" [color="#2297E6"];
##   "gtable" -> "cli" [color="#2297E6"];
##   "gtable" -> "glue" [color="#2297E6"];
##   "gtable" -> "grid" [color="#2297E6"];
##   "gtable" -> "lifecycle" [color="#2297E6"];
##   "gtable" -> "rlang" [color="#2297E6"];
##   "lifecycle" -> "cli" [color="#2297E6"];
##   "lifecycle" -> "glue" [color="#2297E6"];
##   "lifecycle" -> "rlang" [color="#2297E6"];
##   "scales" -> "cli" [color="#2297E6"];
##   "scales" -> "glue" [color="#2297E6"];
##   "scales" -> "lifecycle" [color="#2297E6"];
##   "scales" -> "munsell" [color="#2297E6"];
##   "scales" -> "R6" [color="#2297E6"];
##   "scales" -> "rlang" [color="#2297E6"];
##   "tibble" -> "fansi" [color="#2297E6"];
##   "tibble" -> "lifecycle" [color="#2297E6"];
##   "tibble" -> "magrittr" [color="#2297E6"];
##   "tibble" -> "pillar" [color="#2297E6"];
##   "tibble" -> "pkgconfig" [color="#2297E6"];
##   "tibble" -> "rlang" [color="#2297E6"];
##   "tibble" -> "vctrs" [color="#2297E6"];
##   "vctrs" -> "cli" [color="#2297E6"];
##   "vctrs" -> "glue" [color="#2297E6"];
##   "vctrs" -> "lifecycle" [color="#2297E6"];
##   "vctrs" -> "rlang" [color="#2297E6"];
##   "munsell" -> "colorspace" [color="#2297E6"];
##   "pillar" -> "cli" [color="#2297E6"];
##   "pillar" -> "glue" [color="#2297E6"];
##   "pillar" -> "lifecycle" [color="#2297E6"];
##   "pillar" -> "rlang" [color="#2297E6"];
##   "pillar" -> "utf8" [color="#2297E6"];
##   "pillar" -> "vctrs" [color="#2297E6"];
## }

然后我们使用grViz()生成依赖关系网络图。

DiagrammeR::grViz(dot)

看起来依赖关系网络比简单列出包要略微复杂一些。图中还有一些未连接到ggplot2的包,例如dplyr。它们由ggplot2或其上游包作为“弱依赖”间接加载到R session中。

我们将代码包装为可以重复使用的函数dep_in_session()。在函数内部,我们使用callr包在一个新的R session中运行辅助函数loaded_pkgs(),由它调用library(pkg)并获取session信息。

# Attach the given packages and report which packages are then present in
# the session.
#
# Arguments:
#   pkg  character vector of package names to attach with library().
#
# Returns a JSON string (jsonlite::toJSON) with three arrays:
#   base_pkgs    the attached base packages,
#   other_pkgs   the other attached packages,
#   loaded_pkgs  the packages loaded via a namespace but not attached.
#
# dep_in_session() runs this function in a separate R process through
# callr::r(), so the body only uses its argument and package-qualified calls.
loaded_pkgs = function(pkg) {
    # library() attaches one package per call.
    for(name in pkg) {
        library(name, character.only = TRUE)
    }
    info = sessionInfo()

    # otherPkgs and loadedOnly hold one description per package; only the
    # "Package" field is needed.
    pkg_name = function(desc) desc$Package

    jsonlite::toJSON(list(
        base_pkgs = info$basePkgs,
        other_pkgs = sapply(info$otherPkgs, pkg_name),
        loaded_pkgs = sapply(info$loadedOnly, pkg_name)
    ))
}

# Draw the dependency network of the packages that end up in an R session
# after attaching `pkg`.
#
# Arguments:
#   pkg        character vector of package names to attach.
#   db         package database as returned by pkgndep::reformat_db(); its
#              package_dependencies() method is queried.
#   dep_group  passed as `which` to db$package_dependencies(), e.g. "strong"
#              or "all".
#   rankdir    value of the Graphviz `rankdir` attribute, e.g. "LR" or "TB".
#
# Returns the value of DiagrammeR::grViz() for the generated DOT code.
# Attached packages are drawn in red and loaded-only packages in blue; edge
# colours encode the dependency field.
dep_in_session = function(pkg, db, dep_group = "strong", rankdir = "LR") {

    # loaded_pkgs() is run in a separate R process so that packages already
    # loaded in the current session do not leak into the result. It returns
    # the package groups as a JSON string.
    session_info = jsonlite::fromJSON(callr::r(loaded_pkgs, args = list(pkg = pkg)))

    # An empty JSON array is decoded as list(); normalise both groups to plain
    # character vectors so the set operations below behave uniformly.
    other_pkgs = as.character(unlist(session_info$other_pkgs))
    loaded_only = as.character(unlist(session_info$loaded_pkgs))

    # Gather the recursive dependency table of every attached package and bind
    # them once. The empty three-column seed keeps `mat` a matrix when there is
    # nothing to bind.
    dep_tables = lapply(other_pkgs, function(p) {
        db$package_dependencies(p, recursive = TRUE, which = dep_group)
    })
    mat = unique(do.call(rbind, c(list(matrix(nrow = 0, ncol = 3)), dep_tables)))

    # Drop every edge that touches a base package. The negation must wrap the
    # whole disjunction: `!` binds tighter than `|`, so `!a %in% B | b %in% B`
    # would keep all edges that point to a base package.
    touches_base = mat[, 1] %in% pkgndep:::BASE_PKGS | mat[, 2] %in% pkgndep:::BASE_PKGS

    # Keep only edges whose two endpoints are both present in the session.
    all_pkgs = c(other_pkgs, loaded_only)
    in_session = mat[, 1] %in% all_pkgs & mat[, 2] %in% all_pkgs

    mat = mat[in_session & !touches_base, , drop = FALSE]

    # Every package in an edge or in the session becomes a node.
    all_nodes = unique(c(mat[, 1], mat[, 2], other_pkgs, loaded_only))
    node_col = rep("black", length(all_nodes))
    node_col[all_nodes %in% other_pkgs] = "red"
    node_col[all_nodes %in% loaded_only] = "blue"

    # glue() is vectorised: one DOT statement per node / per edge.
    nodes = glue::glue("  \"{all_nodes}\" [color=\"{node_col}\"];")

    # One colour per dependency field; the integers index the current
    # palette() and are converted to hex strings for DOT.
    dep_col = c(2, 4, 3, 5, 6)
    dep_col = rgb(t(col2rgb(dep_col)), maxColorValue = 255)
    names(dep_col) = c("Depends", "Imports", "LinkingTo", "Suggests", "Enhances")

    # The third column of `mat` selects the edge colour by name.
    edges = glue::glue("  \"{mat[, 1]}\" -> \"{mat[, 2]}\" [color=\"{dep_col[mat[, 3]]}\"];")

    dot = paste(
        c("digraph {",
          "  nodesep=0.05",
          glue::glue("  rankdir={rankdir};"), 
          "  graph [overlap = true];",
          "  node[shape = box];",
          nodes,
          edges,
          "}"),
        collapse = "\n"
    )

    DiagrammeR::grViz(dot)
}

在前面的示例中,我们只考虑了ggplot2上游的“强依赖关系”。如果我们包含所有依赖关系会怎样?

dep_in_session("ggplot2", db = db, dep_group = "all")

它变得更加复杂!特别是我们可以看到存在许多双向依赖关系,例如, A <-> B,其中A是B的强依赖项,而B是A的弱依赖项。

接下来让我们检查一个Bioconductor包DESeq2

我们首先只考虑DESeq2上游的强依赖性。

dep_in_session("DESeq2", db = db, dep_group = "strong")

它已经非常复杂,但有趣的是,DESeq2的依赖项可以很好地分为两组。第一组与Bioconductor包相关,其中很多直接被附加到搜索路径(红色框:在搜索路径中可见;红色链接:“Depends”关系);第二组主要与ggplot2及其上游包相关,它们都间接加载到R会话中(蓝色链接:“Imports”关系)。

如果我们包含所有依赖类型会怎样?现在我们需要将布局更改为“TB”样式(top-bottom)。正如我们所看到的,一个简单的命令library(DESeq2)给你的R session带来了一个dependency monster。

dep_in_session("DESeq2", db = db, dep_group = "all", rankdir = "TB")

最后,让我们看看仅载入Seurat包时的情况。

dep_in_session("Seurat", db = db, dep_group = "strong")

眼花缭乱!

如果包含Seurat所有的依赖关系,会变得异常庞大和复杂。

dep_in_session("Seurat", db = db, dep_group = "all")
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] C.UTF-8/UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] glue_1.8.0     pkgndep_1.99.3 knitr_1.50     colorout_1.3-2
## 
## loaded via a namespace (and not attached):
##  [1] DiagrammeR_1.0.11     sass_0.4.9            BiocVersion_3.20.0    shape_1.4.6.1        
##  [5] blogdown_1.19         digest_0.6.37         magrittr_2.0.3        evaluate_1.0.3       
##  [9] grid_4.4.2            RColorBrewer_1.1-3    bookdown_0.44         iterators_1.0.14     
## [13] circlize_0.4.16       fastmap_1.2.0         foreach_1.5.2         doParallel_1.0.17    
## [17] jsonlite_1.9.0        processx_3.8.6        GlobalOptions_0.1.4   ps_1.9.0             
## [21] ComplexHeatmap_2.25.2 codetools_0.2-20      jquerylib_0.1.4       cli_3.6.4            
## [25] rlang_1.1.5           crayon_1.5.3          visNetwork_2.1.2      cachem_1.1.0         
## [29] yaml_2.3.10           tools_4.4.2           parallel_4.4.2        colorspace_2.1-1     
## [33] GetoptLong_1.0.5      BiocGenerics_0.52.0   hash_2.2.6.3          R6_2.6.1             
## [37] png_0.1-8             matrixStats_1.5.0     stats4_4.4.2          lifecycle_1.0.4      
## [41] S4Vectors_0.44.0      htmlwidgets_1.6.4     IRanges_2.40.1        clue_0.3-66          
## [45] cluster_2.1.6         callr_3.7.6           bslib_0.9.0           xfun_0.51            
## [49] rstudioapi_0.17.1     rjson_0.2.23          htmltools_0.5.8.1     rmarkdown_2.29       
## [53] compiler_4.4.2

本文使用 CC BY-NC-SA 4.0 协议发布。