scATAC-seq 多样本整合分析

引言
之前我介绍过 signac 官网的 scATAC 多样本整合。里面是一个样本一个样本分析的,比较繁琐,这里我写一个批量分析的流程,明白易懂,可以给分析减少很多繁琐的操作和代码。
合并 common peaks
读取每个样本的 peaks 文件进行合并:
library(scDblFinder)
library(GenomicRanges)
library(Signac)
library(future)
library(Seurat)
library(tidyverse)
library(BiocParallel)

sps <- c(“WT_rep1″,”WT_rep2″,”WT_rep3″,”Treat_rep1″,”Treat_rep2″,”Treat_rep3”)
dirs <- c(“WT-1″,”WT-2″,”WT-3″,”Treat-1″,”Treat-2″,”Treat-3”)

==============================================================================

get common peaks

==============================================================================

x = 1

lapply(seq_along(sps),function(x){
peak <- paste(dirs[x],”_peaks.bed”,sep =””)

peaks <- read.table(file = peak, col.names = c(“chr”, “start”, “end”)) |>
makeGRangesFromDataFrame()

return(peaks)
}) |> GRangesList() |> unlist() |> reduce() -> combined.peaks

peakwidths <- width(combined.peaks) combined.peaks <- combined.peaks[peakwidths < 10000 & peakwidths > 20]
combined.peaks

去除双细胞
我们使用 scDblFinder 获取鉴定为双细胞的细胞 id:

==============================================================================

filter doublets

==============================================================================

x = 1

lapply(seq_along(sps),function(x){
path <- paste(dirs[x], “/filtered_peaks_bc_matrix”,sep =””)

fragments_path <- paste(dirs[x],”_A_fragments.tsv.gz”,sep =””)

tmp <- Read10X(path)

chrom_assay <- CreateChromatinAssay(counts = tmp,
sep = c(“:”, “-“),
fragments = fragments_path,
min.cells = 10,
min.features = 200)

sca <- CreateSeuratObject(counts = chrom_assay,
assay = “peaks”
# meta.data = metadata
)

# # remove double cells
sce <- as.SingleCellExperiment(sca)

sce <- scDblFinder(sce,
artificialDoublets = 1,
aggregateFeatures = TRUE,
nfeatures = 25,
processing = “normFeatures”)

print(table(sce$scDblFinder.class))
sce <- sce[, sce$scDblFinder.class == “singlet”]
cell.id <- colnames(sce)

return(cell.id)
}) -> cells.keep

save(cells.keep,file = “cells.keep.rda”)
构建 seurat 对象
然后使用 FeatureMatrix 对第一步的 common peaks 定量,获取 counts 矩阵,接着构建 seurat 对象,最后合并所有样本:
load(“cells.keep.rda”)

==============================================================================

create SeuratObject

==============================================================================

plan(“multicore”, workers = 10)
options(future.globals.maxSize = 100000 * 1024^2)

x = 1

lapply(seq_along(sps),function(x){
fragments_path <- paste(dirs[x],”_A_fragments.tsv.gz”,sep =””)

frag <- CreateFragmentObject(path = fragments_path, cells = cells.keep[[x]])

counts <- FeatureMatrix(fragments = frag,
features = combined.peaks,
cells = cells.keep[[x]])

chrom_assay <- CreateChromatinAssay(counts = counts,
fragments = fragments_path,
genome = ‘mm10’,
min.cells = 10,
min.features = 200)

sca <- CreateSeuratObject(counts = chrom_assay, assay = “atac”, project = sps[x])

return(sca)
}) -> sca.list

plan(“sequential”)

combine list into object

sca.all <- merge(x = sca.list[[1]],
y = sca.list[-1],
add.cell.ids = sps)
sca.all

save(sca.all,file = “sct.all.rda”)
添加基因注释

==============================================================================

BiocManager::install(“EnsDb.Mmusculus.v79”)

library(EnsDb.Mmusculus.v79)
library(ensembldb)

extract gene annotations from EnsDb

annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)

change to UCSC style since the data was mapped to mm10

seqlevels(annotations) <- paste0(‘chr’, seqlevels(annotations))
genome(annotations) <- “mm10”

add the gene information to the object

Annotation(sca.all) <- annotations
质控和过滤
根据不同指标的 qc 图进行过滤:

==============================================================================

qc

==============================================================================

sca.all <- NucleosomeSignal(object = sca.all)
sca.all <- TSSEnrichment(sca.all)

save(sca.all,file = “sct.all.rda”)

load(“sct.all.rda”)

add blacklist value

sca.all$blacklist_ratio <- FractionCountsInRegion(object = sca.all,
assay = ‘atac’,
regions = Signac::blacklist_mm10)

meta <- sca.all@meta.data
colnames(meta)

qc-plot 4×10

VlnPlot(object = sca.all,
features = c(‘nCount_atac’,
‘TSS.enrichment’, ‘blacklist_ratio’, ‘nucleosome_signal’),
pt.size = 0.1,
ncol = 5)

quantile(sca.all@meta.data$nCount_atac,probs = seq(0,1,0.1))
quantile(sca.all@meta.data$TSS.enrichment,probs = seq(0,1,0.1))
quantile(sca.all@meta.data$nucleosome_signal,probs = seq(0,1,0.1))

filter cells

sca.ft <- subset( x = sca.all, subset = nCount_atac > 2000 &
nCount_atac < 100000 & blacklist_ratio < 0.1 & nucleosome_signal < 2 & TSS.enrichment > 1
)

sca.ft

save(sca.ft,file = “sca.ft.rda”)
初次降维聚类
先看看有没有批次效应:

==============================================================================

Normalization and linear dimensional reduction

==============================================================================

sca.ft <- RunTFIDF(sca.ft)
sca.ft <- FindTopFeatures(sca.ft, min.cutoff = ‘q0’)
sca.ft <- RunSVD(object = sca.ft)

DepthCor(sca.ft)
ElbowPlot(sca.ft, ndims = 50, reduction = “lsi”)
visualize_dim_selection(seurat_obj = sca.ft, reduction = “lsi”, max_dims = 50)

sca.ft <- RunUMAP(object = sca.ft, reduction = ‘lsi’, dims = 2:10)
sca.ft <- FindNeighbors(object = sca.ft, reduction = ‘lsi’, dims = 2:10)

multiple resolution cluster-tree

sca.ft.tmp <- sca.ft
for (res in c(seq(0.01,0.1,by = 0.01),seq(0.1,1,by = 0.1))) {
sca.ft.tmp <- FindClusters(sca.ft.tmp, algorithm = 3, resolution = res)
}

clustree::clustree(sca.ft.tmp, prefix = ‘atac_snn_res.’)
rm(sca.ft.tmp)

sca.ft <- FindClusters(object = sca.ft, algorithm = 3, resolution = 0.09)

p1 <- DimPlot(object = sca.ft,
label = TRUE,
group.by = c(“orig.ident”, “seurat_clusters”)
) + NoLegend()

p1

save(sca.ft,file = “sca.ft.rda”)
去除批次效应
这里提供了两种方法 harmony 和官网示例的 CCA,都可以试试看看效果:

==============================================================================

remove batch effect

==============================================================================

load(“sca.ft.rda”)

library(harmony)

#

sca.ft@meta.data$tech <- sapply(strsplit(sca.ft@meta.data$orig.ident,split = “_”),”[“,1)

sca.ft@meta.data |> head()

#

sca.int <- RunHarmony(object = sca.ft,

group.by.vars = c(“orig.ident”, “tech”),

reduction.use = ‘lsi’,

assay.use = ‘atac’,

project.dim = FALSE)

#

sca.int <- RunUMAP(sca.int, dims = 2:10, reduction = ‘harmony’)

sca.int <- FindNeighbors(object = sca.int, reduction = ‘harmony’, dims = 2:10)

sca.int <- FindClusters(object = sca.int, algorithm = 3, resolution = 0.2)

#

p2 <- DimPlot(sca.int, group.by = c(“orig.ident”, “seurat_clusters”),

label = T,

pt.size = 0.1)

#

p1 / p2

=======================================

find integration anchors between 10x and sci-ATAC

object.list <- SplitObject(sca.ft, split.by = “orig.ident”)

anchors <- FindIntegrationAnchors(object.list = object.list,
anchor.features = rownames(object.list[[1]]),
reduction = “rlsi”,
dims = 2:10)

integrate data and create a new merged object

sca.int <- IntegrateEmbeddings(anchorset = anchors,
reductions = sca.ft[[“lsi”]],
new.reduction.name = “int_lsi”,
dims.to.integrate = 1:10)

re-compute the UMAP using corrected LSI embeddings

sca.int <- RunUMAP(sca.int, dims = 2:10, reduction = ‘int_lsi’)
sca.int <- FindNeighbors(object = sca.int, reduction = ‘int_lsi’, dims = 2:10)

multiple resolution cluster-tree

sca.int.tmp <- sca.int
for (res in c(seq(0.01,0.1,by = 0.01),seq(0.1,1,by = 0.1))) {
sca.int.tmp <- FindClusters(sca.int.tmp, algorithm = 3, resolution = res)
}

clustree::clustree(sca.int.tmp, prefix = ‘atac_snn_res.’)
rm(sca.int.tmp)

sca.int <- FindClusters(object = sca.int, algorithm = 3, resolution = 0.2)

p2 <- DimPlot(sca.int, group.by = c(“orig.ident”, “seurat_clusters”),
label = T,
pt.size = 0.1)

p1 / p2

save(sca.int,file = “sca.int.rda”)
构建基因活性矩阵
使用每个基因上游的 promoter 的强度来表征基因表达强度:

==============================================================================

Create a gene activity matrix

==============================================================================

load(“sca.int.rda”)

# quantify gene activity

gene.activities <- GeneActivity(sca.int)

add the gene activity matrix to the Seurat object as a new assay and normalize it

sca.int[[‘ACTIVITY’]] <- CreateAssayObject(counts = gene.activities)

normalize gene activities

DefaultAssay(sca.int) <- “ACTIVITY”
sca.int <- NormalizeData(object = sca.int)
sca.int <- ScaleData(sca.int, features = rownames(sca.int))
后续可以使用对于的 scRNA-seq 来注释 scATAC 的亚群了。
下面是 PC 选择的函数,用来查看选择多少 PC 比较合适:
visualize_dim_selection <- function(seurat_obj,
reduction = “pca”,
max_dims = 50,
variance_thresholds = c(0.60, 0.70, 0.80, 0.90)) {

# — 1. Input Validation and Data Extraction —
if (!reduction %in% names(seurat_obj@reductions)) {
stop(paste(“Reduction ‘”, reduction, “‘ not found in Seurat object. Please run the appropriate reduction method first.”, sep=””))
}

if (!requireNamespace(“ggplot2”, quietly = TRUE) || !requireNamespace(“patchwork”, quietly = TRUE)) {
stop(“Packages ‘ggplot2’ and ‘patchwork’ are required but not installed.”)
}

reduction_obj <- seurat_obj@reductions[[reduction]]

# Check for importance scores (StDev for PCA, Singular Values for LSI)
if (length(reduction_obj@stdev) > 0) {
importance_scores <- reduction_obj@stdev y_label <- “Standard Deviation” plot_title <- “1. Elbow Plot (Standard Deviation)” } else if (length(reduction_obj@S) > 0) {
importance_scores <- reduction_obj@S
y_label <- “Singular Value”
plot_title <- “1. Elbow Plot (Singular Values)”
} else {
stop(paste(“Could not find importance scores (@stdev or @S) in reduction ‘”, reduction, “‘.”, sep=””))
}

# Ensure max_dims is not greater than available dimensions
max_dims <- min(max_dims, length(importance_scores))

# Extract data for the specified number of dimensions
scores_subset <- importance_scores[1:max_dims]
variance <- scores_subset^2
total_var <- sum(importance_scores^2) # Use all dimensions for total variance
pct_var <- variance / total_var * 100
cumulative_pct <- cumsum(variance) / total_var * 100

# — 2. Calculate Metrics —
marginal_gain <- c(pct_var[1], diff(cumulative_pct))
diff1 <- diff(scores_subset)
diff2 <- diff(diff1)
curvature <- c(0, 0, diff2, rep(NA, max_dims – length(diff2) – 2))

plot_data <- data.frame(
Dimension = 1:max_dims,
importance = scores_subset,
individual_var = pct_var,
cumulative_var = cumulative_pct,
marginal_gain = marginal_gain,
curvature = curvature
)

# — 3. Annotations for Plots —
cumulative_pct_full <- cumsum(importance_scores^2) / total_var dim_at_thresholds <- sapply(variance_thresholds, function(thresh) { which(cumulative_pct_full >= thresh)[1]
})

annotation_data <- data.frame(
dim = dim_at_thresholds,
threshold = variance_thresholds * 100,
label = paste0(dim_at_thresholds, ” Dims\n(“, variance_thresholds * 100, “%)”)
)

colors <- c(“red”, “orange”, “green”, “blue”, “purple”)

cat(“\n=== Automatic Dimension Selection Based on Variance Thresholds ===\n”)
for (i in 1:nrow(annotation_data)) {
cat(sprintf(” %.0f%% variance → Dimension %d\n”,
annotation_data$threshold[i],
annotation_data$dim[i]))
}
cat(“\n”)

# — 4. Generate Plots —
# Plot 1: Elbow Plot
p1 <- ggplot2::ggplot(plot_data, ggplot2::aes(x = Dimension, y = importance)) +
ggplot2::geom_line(color = “steelblue”, size = 1) +
ggplot2::geom_point(color = “steelblue”, size = 2)

for (i in 1:nrow(annotation_data)) {
if (!is.na(annotation_data$dim[i]) && annotation_data$dim[i] <= max_dims) {
p1 <- p1 +
ggplot2::geom_vline(xintercept = annotation_data$dim[i], linetype = “dashed”, color = colors[i], alpha = 0.6) +
ggplot2::annotate(“text”, x = annotation_data$dim[i], y = max(scores_subset) * (1 – i * 0.1),
label = annotation_data$label[i], color = colors[i], size = 3, hjust = -0.1)
}
}
p1 <- p1 + ggplot2::labs(title = plot_title, x = “Dimension”, y = y_label) + ggplot2::theme_bw()

# Plot 2: Cumulative Variance
p2 <- ggplot2::ggplot(plot_data, ggplot2::aes(x = Dimension, y = cumulative_var)) +
ggplot2::geom_line(color = “darkgreen”, size = 1) +
ggplot2::geom_point(color = “darkgreen”, size = 1.5)
for (i in 1:length(variance_thresholds)) {
p2 <- p2 + ggplot2::geom_hline(yintercept = variance_thresholds[i] * 100, linetype = “dashed”, color = colors[i], alpha = 0.6)
}
p2 <- p2 + ggplot2::scale_y_continuous(limits = c(0, 100)) +
ggplot2::labs(title = “2. Cumulative Variance Explained”, x = “Dimension”, y = “Cumulative Variance (%)”) + ggplot2::theme_bw()

# Plot 3: Marginal Gain
p3 <- ggplot2::ggplot(plot_data, ggplot2::aes(x = Dimension, y = marginal_gain)) +
ggplot2::geom_bar(stat = “identity”, fill = “coral”, alpha = 0.7) +
ggplot2::geom_hline(yintercept = 0.5, linetype = “dashed”, color = “red”) +
ggplot2::labs(title = “3. Marginal Gain per Dimension”, x = “Dimension”, y = “Variance Gained (%)”) + ggplot2::theme_bw()

# Plot 4: Curvature
p4 <- ggplot2::ggplot(plot_data[3:max_dims, ], ggplot2::aes(x = Dimension, y = abs(curvature))) +
ggplot2::geom_line(color = “purple”, size = 1) +
ggplot2::geom_point(color = “purple”, size = 1.5) +
ggplot2::labs(title = “4. Curvature (2nd Derivative)”, x = “Dimension”, y = “Absolute Curvature”) + ggplot2::theme_bw()

# — 5. Combine and Print —
combined <- (p1 | p2) / (p3 | p4)
print(combined)

# — 6. Recommendations —
cat(“\n=== Recommendations Based on Analysis ===\n”)
threshold_point <- which(plot_data$marginal_gain < 0.5)[1]
if (!is.na(threshold_point)) {
cat(sprintf(“• Marginal gain <0.5%%: Dimension %d\n”, threshold_point))
}

# Recommend elbow point within a reasonable range (e.g., first 30 dimensions)
elbow_range <- 3:min(30, max_dims)
elbow_point <- which.max(abs(plot_data$curvature[elbow_range])) + (elbow_range[1] – 1)
cat(sprintf(“• Maximum curvature (elbow point): Dimension %d\n”, elbow_point))

for (i in 1:nrow(annotation_data)) {
cat(sprintf(“• %.0f%% variance: Dimension %d\n”,
annotation_data$threshold[i],
annotation_data$dim[i]))
}
cat(“\n”)

return(plot_data)
}
结尾
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊老俊俊生信交流群(微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。

声明:来自老俊俊的生信笔记,仅代表创作者观点。链接:https://eyangzhen.com/4578.html

老俊俊的生信笔记的头像老俊俊的生信笔记

相关推荐

关注我们
关注我们
购买服务
购买服务
返回顶部