omicScope 对接 UCSC xena 下载数据

引言
有时候大家可能会直接去 UCSC xena 数据库下载对应癌种表达数据自学分析和绘图。这也是需要较为繁琐的代码操作。omicScope 直接可以整合分析这些数据,一键分析和绘图。
安装

Install from GitHub

if (!requireNamespace(“devtools”, quietly = TRUE)) {
install.packages(“devtools”)
}
devtools::install_github(“junjunlab/omicScope”)

Or using pak (recommended)

if (!requireNamespace(“pak”, quietly = TRUE)) {
install.packages(“pak”)
}
pak::pak(“junjunlab/omicScope”)
示例
这里我们去UCSC xena 数据库下载LUAD的counts 定量数据,临床样本数据和生存数据,此外去GTEx数据库下载后肺的正常组织的测序数据:

Counts

wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz

Phenotype data

wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.clinical.tsv.gz

Survival data

wget https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.survival.tsv.gz

wget https://storage.googleapis.com/adult-gtex/bulk-gex/v10/rna-seq/counts-by-tissue/gene_reads_v10_lung.gct.gz
构建 omicscope 对象
obj <- ucscZenaToObj(gtf_anno = “gencode.v36.annotation.gtf.gz”,
counts_data = “TCGA-LUAD.star_counts.tsv.gz”,
pheno_data = “TCGA-LUAD.clinical.tsv.gz”,
survival_data = “TCGA-LUAD.survival.tsv.gz”,
gtex_counts_data = “gene_reads_v10_lung.gct.gz”)

obj

class: omicscope

dim: 52739 1193

metadata(0):

assays(1): counts

rownames(52739): ENSG00000223972.5 ENSG00000227232.5 … ENSG00000210195.2 ENSG00000210196.2

rowData names(3): gene_id gene_name gene_biotype

colnames(1193): TCGA-38-7271-01A TCGA-55-7914-01A … GTEX-ZZPT-1326-SM-5E43H GTEX-ZZPU-0526-SM-5E44U

colData names(97): sample id … X_PATIENT group2

数据标准化和降维分析:
obj <- normalize_data(obj, norm_type = “tpm”)

obj <- run_reduction(object = obj,
reduction = “pca”,
top_hvg_genes = 3000)

obj <- run_reduction(object = obj,
reduction = “umap”,
top_hvg_genes = 3000)

obj <- run_reduction(object = obj,
reduction = “tsne”,
top_hvg_genes = 3000)

p1 <- dim_plot(obj, reduction = “pca”, color_by = “group2”)
p2 <- dim_plot(obj, reduction = “umap”, color_by = “group2”)
p3 <- dim_plot(obj, reduction = “tsne”, color_by = “group2”)

cowplot::plot_grid(plotlist = list(p1,p2,p3),
ncol = 2)

批次效应去除
可以看到 GTEx 的正常组织样本有明显的批次效应,需要去除批次效应:
obj <- correct_batch_effect(obj)

Found 2 batches

Using full model in ComBat-seq.

Adjusting for 1 covariate(s) or covariate level(s)

Estimating dispersions

Fitting the GLM model

Shrinkage off – using GLM estimates for parameters

Adjusting the data

obj <- normalize_data(obj, norm_type = “tpm”)

obj <- run_reduction(object = obj,
reduction = “pca”,
top_hvg_genes = 3000)

obj <- run_reduction(object = obj,
reduction = “umap”,
top_hvg_genes = 3000)

obj <- run_reduction(object = obj,
reduction = “tsne”,
top_hvg_genes = 3000)

p1 <- dim_plot(obj, reduction = “pca”, color_by = “group2”)
p2 <- dim_plot(obj, reduction = “umap”, color_by = “group2”)
p3 <- dim_plot(obj, reduction = “tsne”, color_by = “group2”)

cowplot::plot_grid(plotlist = list(p1,p2,p3),
ncol = 2)
基本上癌症数据里的正常样本和 GTEx 的聚在一起了:

差异表达分析
obj <- differential_expression(obj,
method = “limma”,
limmaApproach = “voom”)

diff <- obj@diffExpData$limma$treat_vs_control

str(diff)

Formal class ‘diffdata’ [package “omicScope”] with 9 slots

..@ contrastName : chr “treat_vs_control”

..@ method : chr “limma”

..@ design :List of 1

.. ..$ : num [1:589, 1:2] 1 1 1 1 1 1 1 1 1 1 …

.. .. ..- attr(*, “dimnames”)=List of 2

.. .. .. ..$ : chr [1:589] “TCGA-38-7271-01A” “TCGA-55-7914-01A” “TCGA-95-7043-01A” “TCGA-73-4658-01A” …

.. .. .. ..$ : chr [1:2] “(Intercept)” “groupTumor”

.. .. ..- attr(*, “assign”)= int [1:2] 0 1

.. .. ..- attr(*, “contrasts”)=List of 1

.. .. .. ..$ group: chr “contr.treatment”

..@ log2FCthreshold: num 1

..@ pvalueThreshold: num 0.05

..@ sigUp : int 3973

..@ sigDown : int 3955

..@ nonSig : int 52732

..@ data :’data.frame’: 60660 obs. of 10 variables:

.. ..$ gene_id : chr [1:60660] “ENSG00000000003.15” “ENSG00000000005.6” “ENSG00000000419.13” “ENSG00000000457.14” …

.. ..$ log2FoldChange: num [1:60660] 1.1 -0.429 0.296 0.526 1.678 …

.. ..$ AveExpr : num [1:60660] 5.87 -4.76 4.99 3.99 2.82 …

.. ..$ t : num [1:60660] 9.26 -1.37 3.97 7.06 13.32 …

.. ..$ pvalue : num [1:60660] 3.86e-19 1.70e-01 8.08e-05 4.80e-12 1.39e-35 …

.. ..$ padj : num [1:60660] 7.05e-18 2.33e-01 3.16e-04 4.65e-11 7.96e-34 …

.. ..$ B : num [1:60660] 32.26 -6.11 0.07 16.3 69.99 …

.. ..$ gene_name : chr [1:60660] “TSPAN6” “TNMD” “DPM1” “SCYL3” …

.. ..$ gene_biotype : chr [1:60660] “protein_coding” “protein_coding” “protein_coding” “protein_coding” …

.. ..$ type : chr [1:60660] “sigUp” “nonSig” “nonSig” “nonSig” …

Check

diff@data |> head()

gene_id log2FoldChange AveExpr t pvalue padj B gene_name gene_biotype type

1 ENSG00000000003.15 1.0997480 5.872692 9.255036 3.862856e-19 7.051485e-18 32.25994586 TSPAN6 protein_coding sigUp

2 ENSG00000000005.6 -0.4286831 -4.762921 -1.374252 1.698829e-01 2.329236e-01 -6.10668869 TNMD protein_coding nonSig

3 ENSG00000000419.13 0.2959509 4.987883 3.969822 8.075471e-05 3.155024e-04 0.07004036 DPM1 protein_coding nonSig

4 ENSG00000000457.14 0.5261200 3.994452 7.056095 4.798641e-12 4.649187e-11 16.30391548 SCYL3 protein_coding nonSig

5 ENSG00000000460.17 1.6777640 2.815088 13.316756 1.391922e-35 7.957963e-34 69.98860263 C1orf112 protein_coding sigUp

6 ENSG00000000938.13 -2.2200498 4.360530 -18.605462 3.380108e-61 6.571710e-59 128.43374484 FGR protein_coding sigDown

可视化差异结果:
volcano_plot(object = obj, method = “limma”)

对特定的基于热图展示表达量变化
obj <- get_normalized_data(obj)

dif.res <- diff@data

gene <- subset(dif.res, startsWith(gene_name,”METTL”))

Plot

exp_heatmap_plot(obj,
selected_gene = gene$gene_name)

取消样本聚类:
exp_heatmap_plot(obj,
selected_gene = gene$gene_name,
complexHeatmap_params = list(cluster_rows = FALSE))

箱线图展示:
gene_boxViolin_plot(obj,
selected_gene = c(“METTL3″,”METTL14”,
“YTHDC1″,”YTHDC2”,
“YTHDF1″,”YTHDF2″,”YTHDF3”),
point_col = “grey50”,
point_alpha = 0.3)

小提琴图:
gene_boxViolin_plot(obj,
selected_gene = c(“METTL3″,”METTL14”,
“YTHDC1″,”YTHDC2”,
“YTHDF1″,”YTHDF2″,”YTHDF3”),
type = “violin”,
jitter.width = 0.15,
point_col = “grey50”,
point_alpha = 0.3)

基因相关性分析
p1 <- gene_cor_plot(obj,
gene_1 = “METTL3”,
gene_2 = “YTHDF2”,
selected_group = “Tumor”)

p2 <- gene_cor_plot(obj,
gene_1 = “METTL3”,
gene_2 = “YTHDF2”,
selected_group = “Normal”)

p1 + p2

生存预后分析
surv_plot(obj,
selected_gene = c(“YTHDF1″,”YTHDF2″,”YTHDF3″,”METTL5”))

肿瘤微环境分析
obj <- run_TME_analysis(obj,
method = “deconvo”,
deconvo_method = “cibersort”)

==========================================================================

IOBR v0.99.99 Immuno-Oncology Biological Research

For Tutorial: https://iobr.github.io/book/

For Help: https://github.com/IOBR/IOBR/issues

#

If you use IOBR in published research, please cite:

DQ Zeng, YR Fang, …, GC Yu, WJ Liao,

Enhancing immuno-oncology investigations through multidimensional decoding

of tumor microenvironment with IOBR 2.0. Cell Rep Methods 4, 100910 (2024).

&

YR Fang, …, WJ Liao, DQ Zeng,

Systematic Investigation of Tumor Microenvironment and

Antitumor Immunity With IOBR, Med Research (2025).

https://onlinelibrary.wiley.com/doi/epdf/10.1002/mdr2.70001

==========================================================================

#

>>> Running CIBERSORT

obj <- run_TME_analysis(obj,
method = “signature”,
signature_method = “pca”)

>>> Calculating signature score using PCA method

>>> log2 transformation was finished

There were 50 or more warnings (use warnings() to see the first 50)

结果可视化:
tme_plot(obj,
method = “deconvo”,
type = “box”)

热图:
tme_plot(obj,
method = “deconvo”,
type = “heatmap”)

对接 TCGAbiolinks
TCGAbiolinks 下载表达谱示例代码:

BiocManager::install(“TCGAbiolinks”)

library(TCGAbiolinks)
library(SummarizedExperiment)

query.exp <- GDCquery(
project = “TCGA-LUAD”,
data.category = “Transcriptome Profiling”,
data.type = “Gene Expression Quantification”,
workflow.type = “STAR – Counts”
)

GDCdownload(
query = query.exp,
files.per.chunk = 200
)

luad.exp <- GDCprepare(
query = query.exp,
save = TRUE,
save.filename = “luadExp.rda”
)
然后就可以构建 omicscope 对象,也可以支持整合 GTEx 样本数据:
library(omicScope)

load(“luadExp.rda”)

obj <- TCGAbiolinksToObj(tcgabiolinks_obj = data)

obj <- TCGAbiolinksToObj(gtf_anno = “gencode.v36.annotation.gtf.gz”,
tcgabiolinks_obj = data,
gtex_counts_data = “gene_reads_v10_lung.gct.gz”)
obj

class: omicscope

dim: 52739 1204

metadata(0):

assays(1): counts

rownames(52739): ENSG00000223972.5 ENSG00000227232.5 … ENSG00000210195.2 ENSG00000210196.2

rowData names(3): gene_id gene_name gene_biotype

colnames(1204): TCGA-44-6147-01B-06R-A277-07 TCGA-44-6147-01A-11R-1755-07 …

GTEX-ZZPT-1326-SM-5E43H GTEX-ZZPU-0526-SM-5E44U

colData names(99): sample patient … OS group2

下面就是差不多的分析了,大家可以自行探索。
结尾
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊老俊俊生信交流群(微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。

声明:来自老俊俊的生信笔记,仅代表创作者观点。链接:https://eyangzhen.com/3581.html

老俊俊的生信笔记的头像老俊俊的生信笔记

相关推荐

关注我们
关注我们
购买服务
购买服务
返回顶部