引言
gghic 是一个用于创建灵活、适合发表的三维(3D)基因组组织数据可视化图形的 R 包。使用 gghic,您可以在一个统一且可定制的框架中,轻松探索和展示 Hi-C 及类 Hi-C 接触图、拓扑关联域(TADs)、染色质环、基因注释和其他基因组特征。无论您是在分析大规模 Hi-C 实验、可视化多染色体相互作用,还是整合额外的基因组轨道(如 ChIP-seq 或 BigWig 数据),gghic 都为您的研究提供了一个 tidyverse 友好且可扩展的工具包。
参考文档:https://jasonwong-lab.github.io/gghic/articles/gghic.html
安装
# Install gghic from GitHub and build its vignettes (requires devtools).
devtools::install_github("jasonwong-lab/gghic", build_vignettes = TRUE)
示例
加载需要的包:
#' Attach a set of packages, failing fast when one of them is missing.
#'
#' @param pkgs Character vector of package names to attach.
#' @return `NULL`, invisibly; called for its side effect on the search path.
load_pkg <- function(pkgs) {
  for (pkg in pkgs) {
    # library() raises an error for a missing package, whereas require() only
    # returns FALSE, a result the loop would otherwise discard.
    suppressMessages(library(pkg, character.only = TRUE))
  }
  invisible(NULL)
}
# Attach every package used by the examples that follow.
load_pkg(
  c(
    "gghic", "ggplot2", "tibble", "scales", "dplyr", "tidyr", "glue",
    "HiCExperiment", "GenomicRanges", "InteractionSet"
  )
)
下载测试数据:
#' Download the gghic example data files into a local cache directory.
#'
#' @param cache_dir Directory that receives the files; created when missing.
#' @param check_exists If `TRUE`, files already present in `cache_dir` are not
#'   downloaded again.
#' @return `NULL`, invisibly; called for its side effect of writing files.
download_example_files <- function(cache_dir, check_exists = TRUE) {
  # recursive = TRUE so that a nested cache path can be created in one go.
  if (!dir.exists(cache_dir)) dir.create(cache_dir, recursive = TRUE)
  # Local file name -> path of the file below `url_base`.
  files <- list(
    "chr4_11-100kb.cool" = "cooler/chr4_11-100kb.cool",
    "chr4_11-5kb.cool" = "cooler/chr4_11-5kb.cool",
    "track1.bigWig" = "bigwig/track1.bigWig",
    "track2.bigWig" = "bigwig/track2.bigWig",
    "gencode-chr4_11.gtf.gz" = "gtf/gencode-chr4_11.gtf.gz",
    "TADs_500kb-chr4_11.tsv" = "tad/TADs_500kb-chr4_11.tsv",
    "loops-chr4_11.txt" = "loop/loops-chr4_11.txt",
    "gis_hic.rds" = "multiway/gis_hic.rds",
    "concatemers.rds" = "multiway/concatemers.rds"
  )
  url_base <- paste0(
    "https://raw.githubusercontent.com/",
    "mhjiang97/gghic-data/refs/heads/master/"
  )
  for (file_name in names(files)) {
    file_path <- file.path(cache_dir, file_name)
    if (check_exists && file.exists(file_path)) next
    status <- download.file(
      paste0(url_base, files[[file_name]]), file_path, method = "curl"
    )
    if (status != 0) {
      # Drop the partial file so a later run does not mistake it for a
      # finished download and skip it.
      unlink(file_path)
      stop("Failed to download ", file_name, call. = FALSE)
    }
  }
  invisible(NULL)
}
dir_cache <- "../data"
download_example_files(dir_cache)
# Local paths of the downloaded example files.
path_cf_100 <- file.path(dir_cache, "chr4_11-100kb.cool")
path_cf_5 <- file.path(dir_cache, "chr4_11-5kb.cool")
path_gtf <- file.path(dir_cache, "gencode-chr4_11.gtf.gz")
paths_track <- file.path(dir_cache, glue::glue("track{1:2}.bigWig"))
path_tad <- file.path(dir_cache, "TADs_500kb-chr4_11.tsv")
path_loop <- file.path(dir_cache, "loops-chr4_11.txt")
path_gis_hic <- file.path(dir_cache, "gis_hic.rds")
path_concatemers <- file.path(dir_cache, "concatemers.rds")
导入数据:
hic_100 <- path_cf_100 |>
  HiCExperiment::CoolFile() |>
  HiCExperiment::import()
hic_5 <- path_cf_5 |>
  HiCExperiment::CoolFile() |>
  HiCExperiment::import()
#' Transform the interaction scores of a Hi-C object and drop unusable rows.
#'
#' @param data A `HiCExperiment` or `GInteractions` object.
#' @param score_column Name of the column holding the raw scores.
#' @param scale_method Function applied to the raw scores (default `log10`).
#' @return A tibble of the interactions with an added `score` column; rows
#'   whose transformed score is `NA` or infinite are removed.
scale_data <- function(data, score_column = "balanced", scale_method = log10) {
  if (inherits(data, "HiCExperiment")) {
    gis <- InteractionSet::interactions(data)
  } else if (inherits(data, "GInteractions")) {
    gis <- data
  } else {
    stop(
      "Input data must be a HiCExperiment or GInteractions object.",
      call. = FALSE
    )
  }
  gis |>
    tibble::as_tibble() |>
    dplyr::mutate(score = scale_method(.data[[score_column]])) |>
    dplyr::filter(!is.na(score), !is.infinite(score)) |>
    # The limits are the data's own range, so no value is actually squished.
    dplyr::mutate(score = scales::oob_squish(score, c(min(score), max(score))))
}
x_100 <- scale_data(hic_100)
x_5 <- scale_data(hic_5)
geom_hic() 是用于可视化 Hi-C 及类 Hi-C 数据的主函数。它需要以下映射:seqnames1、start1、end1、seqnames2、start2、end2 和 fill:
p <- x_100 |>
  dplyr::filter(seqnames1 == "chr11", seqnames2 == "chr11") |>
  tidyr::drop_na(score) |>
  ggplot2::ggplot(
    ggplot2::aes(
      seqnames1 = seqnames1, start1 = start1, end1 = end1,
      seqnames2 = seqnames2, start2 = start2, end2 = end2,
      fill = score
    )
  ) +
  geom_hic()
p
theme_hic() 是一个用于 geom_hic() 的主题函数,用于增强图形的外观:
# Add the gghic theme to the plot `p` built above and print the result.
p + theme_hic()
gghic() 是一个包装函数,可以接受 HiCExperiment 对象、 GenomicInteractions 对象或 tibble / data.frame 作为输入:
# Subset the HiCExperiment object to chr11 and plot it with the wrapper.
hic_100["chr11"] |>
  gghic()
geom_ideogram() 可以用于向图形中添加染色体 Ideogram:
x_100 |>
  dplyr::filter(seqnames1 == "chr11", seqnames2 == "chr11") |>
  ggplot2::ggplot(
    ggplot2::aes(
      seqnames1 = seqnames1, start1 = start1, end1 = end1,
      seqnames2 = seqnames2, start2 = start2, end2 = end2,
      fill = score
    )
  ) +
  geom_hic() +
  theme_hic() +
  geom_ideogram(
    genome = "hg19", highlight = FALSE, length_ratio = 0.7, fontsize = 8
  )
gghic() 也可以直接向图中添加染色体 Ideogram:
hic_100["chr11"] |>
  gghic(
    ideogram = TRUE, genome = "hg19", highlight = FALSE, length_ratio = 0.7,
    ideogram_fontsize = 8
  )
绘制的区域可以通过设置 highlight = TRUE 在染色体图上高亮显示:
hic_100["chr4:10000000-15000000"] |>
  gghic(
    ideogram = TRUE, genome = "hg19", highlight = TRUE, length_ratio = 0.7,
    ideogram_fontsize = 8
  )
通过使用 geom_annotation() ,可以在图中添加基于轨迹的基因注释:
p <- x_5 |>
  dplyr::filter(
    seqnames1 == "chr11", seqnames2 == "chr11",
    center1 > 67000000, center1 < 67100000,
    center2 > 67000000, center2 < 67100000
  ) |>
  ggplot2::ggplot(
    ggplot2::aes(
      seqnames1 = seqnames1, start1 = start1, end1 = end1,
      seqnames2 = seqnames2, start2 = start2, end2 = end2,
      fill = score
    )
  ) +
  geom_hic() +
  theme_hic() +
  geom_ideogram(
    genome = "hg19", highlight = FALSE, length_ratio = 0.7, fontsize = 8
  )
p + geom_annotation(gtf_path = path_gtf, style = "basic", maxgap = 100000)
style 可以设置为 “arrow” 来改变注释样式,maxgap 用于计算需要多少个不重叠的轨道来进行注释,gene_symbols 可用于指定要绘制的基因:
p + geom_annotation(
  gtf_path = path_gtf, style = "arrow", maxgap = -1,
  gene_symbols = c("GRK2", "SSH3", "KDM2A")
)
染色质环可以使用 geom_loop() 或 geom_loop2() 进行可视化。前者需要指定一个环文件的路径,而后者需要一个 tibble / data.frame ,其中包含环数据,这可以轻松地通过添加额外的美学元素进行定制:
hic_5["chr11:61915000-65000000"] |>
  gghic(loop = TRUE, loop_path = path_loop, loop_is_0based = TRUE)
df_loop <- path_loop |>
  read.table(
    sep = "\t",
    col.names = c("seqnames1", "start1", "end1", "seqnames2", "start2", "end2")
  ) |>
  dplyr::filter(
    seqnames1 == "chr11", seqnames2 == "chr11",
    start1 > 61925000, end1 < 67480000, start2 > 61925000, end2 < 67480000
  )
# Pick six loops at random (no seed is set, so the choice differs between
# runs) and label three of them "A" and three "B".
keep <- sample(nrow(df_loop), 6)
df_loop <- df_loop |>
  dplyr::slice(keep) |>
  dplyr::mutate(
    sample = c(rep("A", 3), rep("B", 3))
  )
hic_5["chr11:61915000-65000000"] |>
  gghic() +
  geom_loop2(
    data = df_loop,
    ggplot2::aes(
      seqnames1 = seqnames1, start1 = start1, end1 = end1,
      seqnames2 = seqnames2, start2 = start2, end2 = end2,
      colour = sample
    ),
    stroke = 1
  )
使用 geom_tad() 或 geom_tad2() 可以添加指示 TAD 的三角形。前者需要 TAD 文件的路径,而后者需要包含 TAD 数据的 tibble / data.frame:
x_100 |>
  dplyr::filter(
    seqnames1 == "chr4", seqnames2 == "chr4",
    start1 > 50000000, end1 < 80000000, start2 > 50000000, end2 < 80000000
  ) |>
  gghic(
    tad = TRUE, tad_is_0based = TRUE, tad_path = path_tad, tad_colour = "#00ff83"
  )
df_tad <- path_tad |>
  read.table(
    sep = "\t", header = FALSE, col.names = c("seqnames", "start", "end")
  ) |>
  # Shift the start coordinate by one (the file is read with
  # tad_is_0based = TRUE in the call above).
  dplyr::mutate(start = start + 1) |>
  dplyr::filter(seqnames == "chr4", start > 60000000, end < 70000000) |>
  # NOTE(review): a length-2 label vector only fits if exactly two TADs
  # remain after the filter above; confirm against the data.
  dplyr::mutate(sample = c("A", "B"))
x_100 |>
  dplyr::filter(
    seqnames1 == "chr4", seqnames2 == "chr4",
    start1 > 60000000, end1 < 70000000, start2 > 60000000, end2 < 70000000
  ) |>
  gghic() +
  geom_tad2(
    data = df_tad,
    ggplot2::aes(
      seqnames = seqnames, start = start, end = end,
      colour = sample
    ),
    stroke = 2
  ) +
  ggplot2::scale_color_grey()
在可视化多个染色体时,也可以在图中添加染色体 Ideogram 和基因注释轨道:
# Keep bins whose centres on both anchors lie within 10-11 Mb (on every
# chromosome), then draw them with ideograms and gene annotation.
p <- x_5 |>
  dplyr::filter(
    center1 > 10000000, center1 < 11000000,
    center2 > 10000000, center2 < 11000000
  ) |>
  gghic(
    draw_boundary = TRUE,
    ideogram = TRUE,
    genome = "hg19",
    highlight = TRUE,
    ideogram_fontsize = 7,
    ideogram_width_ratio = 0.08,
    annotation = TRUE,
    include_ncrna = FALSE,
    gtf_path = path_gtf,
    style = "basic",
    maxgap = 100000,
    annotation_fontsize = 5,
    annotation_width_ratio = 0.05,
    expand_xaxis = TRUE
  )
p
geom_track() 可用于向图中添加其他数据轨道:
p + geom_track(
  data_paths = paths_track, width_ratio = 0.3, data_range = "auto",
  fill = c("#DC0000B2", "#00A087B2")
)
结尾
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊老俊俊生信交流群(微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。
声明:来自老俊俊的生信笔记,仅代表创作者观点。链接:http://eyangzhen.com/3991.html