引言
走一个酒精性脂肪性肝病(NAFLD)肝脏的单细胞基因表达谱的上下游分析。GSE 编号为 GSE270583,四个样本。
原始数据下载
使用 aspera 下载原始 fastq 文件,查看文件每个样本有 r1 和 r2 文件,但是 r1 和 r2 的序列都是 150bp 长度,可能 r1 的 barcode+umi 序列包含了其他接头序列什么的没有去除:
!/usr/bin/env bash
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/021/SRR29534021/SRR29534021_1.fastq.gz . && mv SRR29534021_1.fastq.gz SRR29534021_GSM8347727_NCD_replicate_1_ScRNA-seq_Mus_musculus_RNA-Seq_1.fastq.gz
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/021/SRR29534021/SRR29534021_2.fastq.gz . && mv SRR29534021_2.fastq.gz SRR29534021_GSM8347727_NCD_replicate_1_ScRNA-seq_Mus_musculus_RNA-Seq_2.fastq.gz
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/018/SRR29534018/SRR29534018_1.fastq.gz . && mv SRR29534018_1.fastq.gz SRR29534018_GSM8347730_JQF_replicate_1_ScRNA-seq_Mus_musculus_RNA-Seq_1.fastq.gz
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/018/SRR29534018/SRR29534018_2.fastq.gz . && mv SRR29534018_2.fastq.gz SRR29534018_GSM8347730_JQF_replicate_1_ScRNA-seq_Mus_musculus_RNA-Seq_2.fastq.gz
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/019/SRR29534019/SRR29534019_1.fastq.gz . && mv SRR29534019_1.fastq.gz SRR29534019_GSM8347729_HFD_replicate_2_ScRNA-seq_Mus_musculus_RNA-Seq_1.fastq.gz
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/019/SRR29534019/SRR29534019_2.fastq.gz . && mv SRR29534019_2.fastq.gz SRR29534019_GSM8347729_HFD_replicate_2_ScRNA-seq_Mus_musculus_RNA-Seq_2.fastq.gz
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/020/SRR29534020/SRR29534020_1.fastq.gz . && mv SRR29534020_1.fastq.gz SRR29534020_GSM8347728_HFD_replicate_1_ScRNA-seq_Mus_musculus_RNA-Seq_1.fastq.gz
ascp -QT -l 300m -P33001 -i $HOME/.aspera/connect/etc/asperaweb_id_dsa.openssh era-fasp@fasp.sra.ebi.ac.uk:vol1/fastq/SRR295/020/SRR29534020/SRR29534020_2.fastq.gz . && mv SRR29534020_2.fastq.gz SRR29534020_GSM8347728_HFD_replicate_1_ScRNA-seq_Mus_musculus_RNA-Seq_2.fastq.gz
下载参考基因组索引
去 cellranger 官网下载对应物种的索引文件:
wget “https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCm39-2024-A.tar.gz”
tar -zxvf refdata-gex-GRCm39-2024-A.tar.gz
定量
去官网下载 cellranger 软件,然后进行定量:
for i in HFD_rep1 HFD_rep2 JQF_rep1 NCD_rep1
do
../../cellranger_test/cellranger-9.0.0/cellranger count \
–id ${i} \
–transcriptome ../refdata-gex-GRCm39-2024-A \
–fastqs ../1.raw-data/ \
–localcores 15 \
–sample ${i} \
–chemistry SC3Pv3 \
–create-bam true \
–nosecondary
done
每个样本文件夹下的 outs 文件夹就是输出结果:
下游分析
标准 seurat 流程,批量读取表达矩阵:
library(Seurat)
samples <- c(“NCD_rep1″,”JQF_rep1″,”HFD_rep1″,”HFD_rep2”)
create SeuratObject
x = 1
lapply(seq_along(samples), function(x){
path <- paste(“~/junjun_proj/17.sc_test/GSE270583-proj/2.count-data/”,
samples[x],
“/outs/filtered_feature_bc_matrix”,sep = “”)
tmp <- Read10X(path)
sct <- CreateSeuratObject(counts = tmp,
project = samples[x],
min.cells = 5,
min.features = 200)
# calculate mito
sct[[“percent.mt”]] <- PercentageFeatureSet(sct, pattern = “mt-“)
return(sct)
}) -> sct.list
combine list into object
sct.all <- merge(x = sct.list[[1]],
y = sct.list[-1],
add.cell.ids = samples)
sct.all
An object of class Seurat
23474 features across 32392 samples within 1 assay
Active assay: RNA (23474 features, 0 variable features)
4 layers present: counts.NCD_rep1, counts.JQF_rep1, counts.HFD_rep1, counts.HFD_rep2
简单质控:
Visualize mt
VlnPlot(sct.all, features = c(“nFeature_RNA”, “nCount_RNA”,”percent.mt”),
group.by = “orig.ident”,pt.size = 0)
过滤细胞,然后标准化:
filter cells
sct.ft <- subset(sct.all, percent.mt < 5 & nFeature_RNA < 7500)
sct.ft <- SCTransform(object = sct.ft,
variable.features.n = 3000,
vars.to.regress = “percent.mt”)
DefaultAssay(sct.ft)
[1] “SCT”
saveRDS(sct.ft,file = “sct.ft.rda”)
接着降维、聚类和分群:
sct.ft <- RunPCA(sct.ft,features = VariableFeatures(object = sct.ft))
sct.ft <- FindNeighbors(sct.ft, reduction = “pca”, dims = 1:30)
multiple resolution
for (res in seq(0.1,1,by = 0.1)) {
sct.ft <- FindClusters(sct.ft, resolution = res)
}
library(clustree)
clustree::clustree(sct.ft, prefix = ‘SCT_snn_res.’)
使用 0.1 的分辨率来分群:
sct.ft <- FindClusters(sct.ft, resolution = 0.1,
cluster.name = “unintegrated_clusters”)
sct.ft <- RunUMAP(sct.ft,dims = 1:30,
reduction = “pca”,
reduction.name = “umap.unintegrated”)
p1 <- DimPlot(sct.ft,
reduction = “umap.unintegrated”,
group.by = c(“orig.ident”, “unintegrated_clusters”))
p1
看着有一些批次效应,下面进行去除:
after harmony intergation
sct.la <- IntegrateLayers(object = sct.ft,
method = HarmonyIntegration,
orig.reduction = “pca”,
new.reduction = “harmony”,
normalization.method = “SCT”,
verbose = F)
sct.la <- FindNeighbors(sct.la,
reduction = “harmony”,
dims = 1:30)
sct.la <- FindClusters(sct.la,
resolution = 0.1,
cluster.name = “harmony_clusters”)
sct.la <- RunUMAP(sct.la,dims = 1:30,
reduction = “harmony”,
reduction.name = “umap.harmony”)
p2 <- DimPlot(sct.la,
reduction = “umap.harmony”,
group.by = c(“orig.ident”, “harmony_clusters”))
p1 / p2
寻找 marker 基因:
find markers
DefaultAssay(sct.la) <- “RNA”
sct.la <- JoinLayers(sct.la)
DefaultAssay(sct.la) <- “SCT”
sct.la <- PrepSCTFindMarkers(sct.la, assay = “SCT”, verbose = TRUE)
sct.markers <- SeuratWrappers::RunPrestoAll(object = sct.la,
only.pos = T,
logfc.threshold = 0.25,
assay = “SCT”,
test.use = “wilcox”,
min.pct = 0.25)
save(sct.markers, file = “sct.markers.rda”)
library(dplyr)
get top markers
top.markers <- sct.markers %>%
dplyr::group_by(cluster) %>%
dplyr::filter(avg_log2FC > 0.25 & p_val_adj < 0.05) %>%
top_n(n = 5,wt = avg_log2FC)
DoHeatmap(sct.la, features = top.markers$gene) +
NoLegend() +
scale_fill_gradient2(low = “blue”,mid = “white”,high = “red”,
midpoint = 0)
dotplot 看看:
DotPlot(sct.la,features = top.markers$gene) +
coord_flip()
亚群注释,这里使用 easybio 包自动注释看看:
cell annotate
library(easybio)
table(sct.la$harmony_clusters)
marker <- matchCellMarker2(marker = sct.markers,
n = 50,
spc = ‘Mouse’,
tissueType = “Brain”)[, head(.SD, 3), by = cluster]
cl2cell <- marker[, head(.SD, 1), by = .(cluster)]
cl2cell <- setNames(cl2cell[[“cell_name”]], cl2cell[[“cluster”]])
add celltype
sct.la <- RenameIdents(sct.la, cl2cell)
DimPlot(sct.la,
reduction = “umap.harmony”,
label = TRUE, pt.size = 0.5) +
NoLegend()
save
saveRDS(sct.la, file = “sct.la.anno.rds”)
结尾
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊老俊俊生信交流群(微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。
声明:来自老俊俊的生信笔记,仅代表创作者观点。链接:https://eyangzhen.com/3660.html