引言
有时候大家可能会直接去 UCSC xena 数据库下载对应癌种表达数据自学分析和绘图。这也是需要较为繁琐的代码操作。omicScope 直接可以整合分析这些数据,一键分析和绘图。
安装
Install from GitHub
if (!requireNamespace(“devtools”, quietly = TRUE)) {
install.packages(“devtools”)
}
devtools::install_github(“junjunlab/omicScope”)
Or using pak (recommended)
if (!requireNamespace(“pak”, quietly = TRUE)) {
install.packages(“pak”)
}
pak::pak(“junjunlab/omicScope”)
示例
这里我们去UCSC xena 数据库下载LUAD的counts 定量数据,临床样本数据和生存数据,此外去GTEx数据库下载后肺的正常组织的测序数据:
Counts
wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz
Phenotype data
wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.clinical.tsv.gz
Survival data
wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.survival.tsv.gz
wget https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/counts-by-tissue/gene_reads_v10_lung.gct.gz
构建 omicscope 对象
obj <- ucscZenaToObj(gtf_anno = “gencode.v36.annotation.gtf.gz”,
counts_data = “TCGA-LUAD.star_counts.tsv.gz”,
pheno_data = “TCGA-LUAD.clinical.tsv.gz”,
survival_data = “TCGA-LUAD.survival.tsv.gz”,
gtex_counts_data = “gene_reads_v10_lung.gct.gz”)
obj
class: omicscope
dim: 52739 1193
metadata(0):
assays(1): counts
rownames(52739): ENSG00000223972.5 ENSG00000227232.5 … ENSG00000210195.2 ENSG00000210196.2
rowData names(3): gene_id gene_name gene_biotype
colnames(1193): TCGA-38-7271-01A TCGA-55-7914-01A … GTEX-ZZPT-1326-SM-5E43H GTEX-ZZPU-0526-SM-5E44U
colData names(97): sample id … X_PATIENT group2
数据标准化和降维分析:
obj <- normalize_data(obj, norm_type = “tpm”)
obj <- run_reduction(object = obj,
reduction = “pca”,
top_hvg_genes = 3000)
obj <- run_reduction(object = obj,
reduction = “umap”,
top_hvg_genes = 3000)
obj <- run_reduction(object = obj,
reduction = “tsne”,
top_hvg_genes = 3000)
p1 <- dim_plot(obj, reduction = “pca”, color_by = “group2”)
p2 <- dim_plot(obj, reduction = “umap”, color_by = “group2”)
p3 <- dim_plot(obj, reduction = “tsne”, color_by = “group2”)
cowplot::plot_grid(plotlist = list(p1,p2,p3),
ncol = 2)
批次效应去除
可以看到 GTEx 的正常组织样本有明显的批次效应,需要去除批次效应:
obj <- correct_batch_effect(obj)
Found 2 batches
Using full model in ComBat-seq.
Adjusting for 1 covariate(s) or covariate level(s)
Estimating dispersions
Fitting the GLM model
Shrinkage off – using GLM estimates for parameters
Adjusting the data
obj <- normalize_data(obj, norm_type = “tpm”)
obj <- run_reduction(object = obj,
reduction = “pca”,
top_hvg_genes = 3000)
obj <- run_reduction(object = obj,
reduction = “umap”,
top_hvg_genes = 3000)
obj <- run_reduction(object = obj,
reduction = “tsne”,
top_hvg_genes = 3000)
p1 <- dim_plot(obj, reduction = “pca”, color_by = “group2”)
p2 <- dim_plot(obj, reduction = “umap”, color_by = “group2”)
p3 <- dim_plot(obj, reduction = “tsne”, color_by = “group2”)
cowplot::plot_grid(plotlist = list(p1,p2,p3),
ncol = 2)
基本上癌症数据里的正常样本和 GTEx 的聚在一起了:
差异表达分析
obj <- differential_expression(obj,
method = “limma”,
limmaApproach = “voom”)
diff <- obj@diffExpData$limma$treat_vs_control
str(diff)
Formal class ‘diffdata’ [package “omicScope”] with 9 slots
..@ contrastName : chr “treat_vs_control”
..@ method : chr “limma”
..@ design :List of 1
.. ..$ : num [1:589, 1:2] 1 1 1 1 1 1 1 1 1 1 …
.. .. ..- attr(*, “dimnames”)=List of 2
.. .. .. ..$ : chr [1:589] “TCGA-38-7271-01A” “TCGA-55-7914-01A” “TCGA-95-7043-01A” “TCGA-73-4658-01A” …
.. .. .. ..$ : chr [1:2] “(Intercept)” “groupTumor”
.. .. ..- attr(*, “assign”)= int [1:2] 0 1
.. .. ..- attr(*, “contrasts”)=List of 1
.. .. .. ..$ group: chr “contr.treatment”
..@ log2FCthreshold: num 1
..@ pvalueThreshold: num 0.05
..@ sigUp : int 3973
..@ sigDown : int 3955
..@ nonSig : int 52732
..@ data :’data.frame’: 60660 obs. of 10 variables:
.. ..$ gene_id : chr [1:60660] “ENSG00000000003.15” “ENSG00000000005.6” “ENSG00000000419.13” “ENSG00000000457.14” …
.. ..$ log2FoldChange: num [1:60660] 1.1 -0.429 0.296 0.526 1.678 …
.. ..$ AveExpr : num [1:60660] 5.87 -4.76 4.99 3.99 2.82 …
.. ..$ t : num [1:60660] 9.26 -1.37 3.97 7.06 13.32 …
.. ..$ pvalue : num [1:60660] 3.86e-19 1.70e-01 8.08e-05 4.80e-12 1.39e-35 …
.. ..$ padj : num [1:60660] 7.05e-18 2.33e-01 3.16e-04 4.65e-11 7.96e-34 …
.. ..$ B : num [1:60660] 32.26 -6.11 0.07 16.3 69.99 …
.. ..$ gene_name : chr [1:60660] “TSPAN6” “TNMD” “DPM1” “SCYL3” …
.. ..$ gene_biotype : chr [1:60660] “protein_coding” “protein_coding” “protein_coding” “protein_coding” …
.. ..$ type : chr [1:60660] “sigUp” “nonSig” “nonSig” “nonSig” …
Check
diff@data |> head()
gene_id log2FoldChange AveExpr t pvalue padj B gene_name gene_biotype type
1 ENSG00000000003.15 1.0997480 5.872692 9.255036 3.862856e-19 7.051485e-18 32.25994586 TSPAN6 protein_coding sigUp
2 ENSG00000000005.6 -0.4286831 -4.762921 -1.374252 1.698829e-01 2.329236e-01 -6.10668869 TNMD protein_coding nonSig
3 ENSG00000000419.13 0.2959509 4.987883 3.969822 8.075471e-05 3.155024e-04 0.07004036 DPM1 protein_coding nonSig
4 ENSG00000000457.14 0.5261200 3.994452 7.056095 4.798641e-12 4.649187e-11 16.30391548 SCYL3 protein_coding nonSig
5 ENSG00000000460.17 1.6777640 2.815088 13.316756 1.391922e-35 7.957963e-34 69.98860263 C1orf112 protein_coding sigUp
6 ENSG00000000938.13 -2.2200498 4.360530 -18.605462 3.380108e-61 6.571710e-59 128.43374484 FGR protein_coding sigDown
可视化差异结果:
volcano_plot(object = obj, method = “limma”)
对特定的基于热图展示表达量变化
obj <- get_normalized_data(obj)
dif.res <- diff@data
gene <- subset(dif.res, startsWith(gene_name,”METTL”))
Plot
exp_heatmap_plot(obj,
selected_gene = gene$gene_name)
取消样本聚类:
exp_heatmap_plot(obj,
selected_gene = gene$gene_name,
complexHeatmap_params = list(cluster_rows = FALSE))
箱线图展示:
gene_boxViolin_plot(obj,
selected_gene = c(“METTL3″,”METTL14”,
“YTHDC1″,”YTHDC2”,
“YTHDF1″,”YTHDF2″,”YTHDF3”),
point_col = “grey50”,
point_alpha = 0.3)
小提琴图:
gene_boxViolin_plot(obj,
selected_gene = c(“METTL3″,”METTL14”,
“YTHDC1″,”YTHDC2”,
“YTHDF1″,”YTHDF2″,”YTHDF3”),
type = “violin”,
jitter.width = 0.15,
point_col = “grey50”,
point_alpha = 0.3)
基因相关性分析
p1 <- gene_cor_plot(obj,
gene_1 = “METTL3”,
gene_2 = “YTHDF2”,
selected_group = “Tumor”)
p2 <- gene_cor_plot(obj,
gene_1 = “METTL3”,
gene_2 = “YTHDF2”,
selected_group = “Normal”)
p1 + p2
生存预后分析
surv_plot(obj,
selected_gene = c(“YTHDF1″,”YTHDF2″,”YTHDF3″,”METTL5”))
肿瘤微环境分析
obj <- run_TME_analysis(obj,
method = “deconvo”,
deconvo_method = “cibersort”)
==========================================================================
IOBR v0.99.99 Immuno-Oncology Biological Research
For Tutorial: https://iobr.github.io/book/
For Help: https://github.com/IOBR/IOBR/issues
#
If you use IOBR in published research, please cite:
DQ Zeng, YR Fang, …, GC Yu, WJ Liao,
Enhancing immuno-oncology investigations through multidimensional decoding
of tumor microenvironment with IOBR 2.0. Cell Rep Methods 4, 100910 (2024).
&
YR Fang, …, WJ Liao, DQ Zeng,
Systematic Investigation of Tumor Microenvironment and
Antitumor Immunity With IOBR, Med Research (2025).
https://onlinelibrary.wiley.com/doi/epdf/10.1002/mdr2.70001
==========================================================================
#
>>> Running CIBERSORT
obj <- run_TME_analysis(obj,
method = “signature”,
signature_method = “pca”)
>>> Calculating signature score using PCA method
>>> log2 transformation was finished
There were 50 or more warnings (use warnings() to see the first 50)
结果可视化:
tme_plot(obj,
method = “deconvo”,
type = “box”)
热图:
tme_plot(obj,
method = “deconvo”,
type = “heatmap”)
对接 TCGAbiolinks
TCGAbiolinks 下载表达谱示例代码:
BiocManager::install(“TCGAbiolinks”)
library(TCGAbiolinks)
library(SummarizedExperiment)
query.exp <- GDCquery(
project = “TCGA-LUAD”,
data.category = “Transcriptome Profiling”,
data.type = “Gene Expression Quantification”,
workflow.type = “STAR – Counts”
)
GDCdownload(
query = query.exp,
files.per.chunk = 200
)
luad.exp <- GDCprepare(
query = query.exp,
save = TRUE,
save.filename = “luadExp.rda”
)
然后就可以构建 omicscope 对象,也可以支持整合 GTEx 样本数据:
library(omicScope)
load(“luadExp.rda”)
obj <- TCGAbiolinksToObj(tcgabiolinks_obj = data)
obj <- TCGAbiolinksToObj(gtf_anno = “gencode.v36.annotation.gtf.gz”,
tcgabiolinks_obj = data,
gtex_counts_data = “gene_reads_v10_lung.gct.gz”)
obj
class: omicscope
dim: 52739 1204
metadata(0):
assays(1): counts
rownames(52739): ENSG00000223972.5 ENSG00000227232.5 … ENSG00000210195.2 ENSG00000210196.2
rowData names(3): gene_id gene_name gene_biotype
colnames(1204): TCGA-44-6147-01B-06R-A277-07 TCGA-44-6147-01A-11R-1755-07 …
GTEX-ZZPT-1326-SM-5E43H GTEX-ZZPU-0526-SM-5E44U
colData names(99): sample patient … OS group2
下面就是差不多的分析了,大家可以自行探索。
结尾
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊老俊俊生信交流群(微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。
声明:来自老俊俊的生信笔记,仅代表创作者观点。链接:https://eyangzhen.com/3581.html