Molecular Cell 文章复现:核糖体暂停位点计算

引言
还是用这篇 Molecular Cell 文章酵母的 EIF5A 的文章数据。关于暂停位点得分的计算参考这篇 RNA 期刊的文章 《PausePred and Rfeet: webtools for inferring ribosome pauses and visualizing footprint density from ribosome profiling data》。相关的方法整合进 riboTransVis 包里。
数据文章:

计算方法文章:

公式:
Si=n2⁢ri⁢∑i=1nri+∑i=n23⁢n2ri∑i=1nri⁢∑i=n23⁢n2ri.
具体介绍可以参考这篇 寻找核糖体暂停位点。
安装

install.packages(“devtools”)

devtools::install_github(“junjunlab/riboTransVis”)

or

remotes::install_github(“junjunlab/riboTransVis”)

构建 ribotrans 对象
使用比对好的 bam 文件作为输入,构建对象,这里是两个敲减 EIF5A 和两个野生型的四个样本:
sp <- c(“wt-rep1″,”wt-rep2″,”sgeIF5A-rep1″,”sgeIF5A-rep2”)
gp <- c(“wt”,”wt”,”sgeIF5A”,”sgeIF5A”)

ribobams <- c(“WT.1.sorted.bam”,”WT.2.sorted.bam”,”eIF5Ad.1.sorted.bam”,”eIF5Ad.2.sorted.bam”)

obj0 <- construct_ribotrans(genome_file = “../../index-data/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa”,
gtf_file = “../../index-data/Saccharomyces_cerevisiae.R64-1-1.112.gtf”,
mapping_type = “genome”,
assignment_mode = “end5”,
extend = TRUE,
extend_upstream = 50,
extend_downstream = 50,
Ribo_bam_file = ribobams,
Ribo_sample_name = sp,
RNA_sample_group = gp,
choose_longest_trans = T)

generate summary data for QC or other analysis

obj0 <- generate_summary(object = obj0, exp_type = “ribo”, nThreads = 40)

head(obj0@summary_info)

rname pos qwidth count sample sample_group mstart mstop translen

1 YAL067C_mRNA|SEO1 1774 27 1 wt-rep1 wt-rep1 51 1829 1882

2 YAL067C_mRNA|SEO1 1771 27 1 wt-rep1 wt-rep1 51 1829 1882

3 YAL067C_mRNA|SEO1 1766 20 1 wt-rep1 wt-rep1 51 1829 1882

4 YAL067C_mRNA|SEO1 1761 27 1 wt-rep1 wt-rep1 51 1829 1882

5 YAL067C_mRNA|SEO1 1662 28 1 wt-rep1 wt-rep1 51 1829 1882

6 YAL067C_mRNA|SEO1 1632 25 1 wt-rep1 wt-rep1 51 1829 1882

一些质控:
library(ggplot2)

qc

length_plot(obj0)

metagene_plot(object = obj0,
mode = “codon”,
rel2st_dist = c(-50, 900))

矫正偏移
这里使用-12nt,便宜到 P 位点(EPA):
offset <- rbind(data.frame(list(sample = “sgeIF5A-rep1”,
qwidth = 25:32,
rel_pos = -12)),
data.frame(list(sample = “sgeIF5A-rep2”,
qwidth = 25:32,
rel_pos = -12)),
data.frame(list(sample = “wt-rep1”,
qwidth = 25:32,
rel_pos = -12)),
data.frame(list(sample = “wt-rep2”,
qwidth = 25:32,
rel_pos = -12))
)

obj0@reads_offset_info <- offset
计算暂停得分
get_pausing_sites 函数来计算,这里选择 replicate 1 样本来计算:

pause site

get pause site

ps <- get_pausing_sites(object = obj0,
do_offset_correct = T,
sample_selected = c(“wt-rep1″,”sgeIF5A-rep1”))

check

head(ps)

# A tibble: 6 × 16

# Groups: rname, sample [1]

rname relst sample normsm total_counts sum_1 sum_2 pause_score total_sum min_sum max_sum balance_score imbalance_ratio pause_mean

1 Q0250… 1 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40

2 Q0250… 2 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40

3 Q0250… 3 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40

4 Q0250… 4 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40

5 Q0250… 5 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40

6 Q0250… 6 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40

# ℹ 2 more variables: pause_sd , z_score

plot

ps.ft <- ps %>%
dplyr::filter(pause_score > 20 & z_score > 0)

check

head(ps.ft)

# A tibble: 6 × 16

# Groups: rname, sample [1]

rname relst sample normsm total_counts sum_1 sum_2 pause_score total_sum min_sum max_sum balance_score imbalance_ratio pause_mean

1 Q0250… 559 sgeIF… 5.62 134 191. 28.1 22.9 219. 28.1 191. 0.128 6.8 1.40

2 Q0250… 562 sgeIF… 16.9 134 185. 28.1 69.1 214. 28.1 185. 0.132 6.6 1.40

3 Q0250… 568 sgeIF… 11.2 268 169. 28.1 46.7 197. 28.1 169. 0.143 6 1.40

4 Q0250… 588 sgeIF… 11.2 268 157. 22.5 57.1 180. 22.5 157. 0.125 7 1.40

5 Q0250… 591 sgeIF… 5.62 134 146. 16.9 37.2 163. 16.9 146. 0.103 8.67 1.40

6 Q0250… 593 sgeIF… 16.9 268 140. 16.9 112 157. 16.9 140. 0.107 8.33 1.40

# ℹ 2 more variables: pause_sd , z_score

原文展示了 3 个基因发生了明显的暂停:

我们复现这几个看看,然后把 pause_score > 20 的位置也标出来:
library(ggplot2)

examples

ps.ft.pt <- ps %>%
dplyr::filter(rname %in% c(“YGL206C_mRNA|CHC1″,”YER063W_mRNA|THO1”,
“YER059W_mRNA|PCL6”))

id <- unique(ps.ft.pt$rname)

loop plot

x = 1

lapply(seq_along(id),function(x){
tmp <- subset(ps.ft.pt, rname == id[x]) tmp.ps <- subset(ps.ft.pt, rname == id[x]) %>%
dplyr::filter(pause_score > 20 & z_score > 0)

ggplot(tmp) +
geom_col(aes(x = relst,y = normsm),fill = “#1A2A80”,color = “#1A2A80”) +
geom_point(data = tmp.ps,aes(x = relst, y = 0),
color = “black”,fill = “#FFD700”,
shape = 21,size = 3) +
# facet_wrap(~sample,scales = “free”) +
ggh4x::facet_grid2(sample~rname,scales = “fixed”) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour = “black”),
aspect.ratio = 0.6,
strip.text = element_text(face = “bold”,size = rel(1))) +
xlab(“Position along CDS (nt)”) +
ylab(“Ribosome density”)
}) -> p

cowplot::plot_grid(plotlist = p,nrow = 1)

可以看到位点还是很符合的。再挑几个看看:

examples

ps.ft.pt <- ps %>%
dplyr::filter(rname %in% c(“YAL001C_mRNA|TFC3″,”YGR097W_mRNA|ASK10”,
“YLR339C_mRNA|YLR339C_mRNA”,”YDR154C_mRNA|YDR154C_mRNA”))

id <- unique(ps.ft.pt$rname)

loop plot

x = 1

lapply(seq_along(id),function(x){
tmp <- subset(ps.ft.pt, rname == id[x]) tmp.ps <- subset(ps.ft.pt, rname == id[x]) %>%
dplyr::filter(pause_score > 20)

ggplot(tmp) +
geom_col(aes(x = relst,y = normsm),fill = “#1A2A80”,color = “#1A2A80”) +
geom_point(data = tmp.ps,aes(x = relst, y = 0),
color = “black”,fill = “#FFD700”,
shape = 21,size = 3) +
# facet_wrap(~sample,scales = “free”) +
ggh4x::facet_grid2(sample~rname,scales = “fixed”) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour = “black”),
aspect.ratio = 0.6,
strip.text = element_text(face = “bold”,size = rel(1))) +
xlab(“Position along CDS (nt)”) +
ylab(“Ribosome density”)
}) -> p

cowplot::plot_grid(plotlist = p,nrow = 1)

暂停基序分析
暂停位点往往和附件序列的特征有关,比如不同极性的氨基酸性质,我们可以提取暂停位点附件的基序来看附件的序列特征。
这里我们提取每个基因前 3 的暂停位点,然后与野生型比较,筛选敲低 EIF5A 上调的暂停位点信息:

long to wide

ps.wd <- ps %>%
dplyr::select(rname,relst,sample,z_score,pause_score) %>%
tidyr::pivot_wider(names_from = sample,values_from = c(z_score,pause_score))

filter top pause site for each gene and up regulated sites

ps.wd.ft <- ps.wd %>%
dplyr::filter(pause_score_sgeIF5A-rep1 > 0 & pause_score_wt-rep1 > 0) %>%
dplyr::mutate(ps_ratio = log2(pause_score_sgeIF5A-rep1/pause_score_wt-rep1),
zs_ratio = log2(z_score_sgeIF5A-rep1/z_score_wt-rep1)) %>%
dplyr::filter(ps_ratio > 1 & zs_ratio > 1) %>%
dplyr::ungroup() %>%
dplyr::slice_max(order_by = ps_ratio,n = 3,by = rname)

check

head(ps.wd.ft)

# A tibble: 6 × 8

rname relst z_score_sgeIF5A-rep1 z_score_wt-rep1 pause_score_sgeIF5A-rep…¹ pause_score_wt-rep1 ps_ratio zs_ratio

1 YAL001C_mRNA|TFC3 1575 18.1 1.70 103. 6.58 3.96 3.42

2 YAL001C_mRNA|TFC3 21 13.3 1.28 75.4 5.24 3.85 3.37

3 YAL001C_mRNA|TFC3 1758 8.49 0.895 48.7 3.99 3.61 3.25

4 YAL002W_mRNA|VPS8 144 3.57 0.402 18.6 2.76 2.75 3.15

5 YAL002W_mRNA|VPS8 2076 10.6 1.87 53 9 2.56 2.51

6 YAL002W_mRNA|VPS8 729 6.66 1.36 33.7 6.83 2.30 2.29

# ℹ abbreviated name: ¹pause_score_sgeIF5A-rep1

get_codon_from_site 函数可以从上面筛选的暂停位点信息,提取周围的氨基酸序列,left_extend 和 right_extend 参数指定向上下游延伸几个氨基酸:

get codons around P-sie

pep.df <- get_codon_from_site(object = obj0,
cds_fa = “sac_cds.fa”,
pause_data = ps.wd.ft,
left_extend = 5,
right_extend = 5)

check

head(pep.df)

rname codon_pos lp rp len seqs

1 YAL001C_mRNA|TFC3 526 521 531 1161 KSAEDSPSSNG

2 YAL001C_mRNA|TFC3 8 3 13 1161 LTIYPDELVQI

3 YAL001C_mRNA|TFC3 587 582 592 1161 KYMGSTTTLDK

4 YAL002W_mRNA|VPS8 49 44 54 1275 YTAPPLNEDGP

5 YAL002W_mRNA|VPS8 693 688 698 1275 GFSIKWPSNSN

6 YAL002W_mRNA|VPS8 244 239 249 1275 KILDVNSSKEK

绘制 motif 特征,位置 5-6-7 对应 EPA 位点:

motif plot

logo_plot(foreground_seqs = pep.df$seqs,
method = “bits”)

logo_plot(foreground_seqs = pep.df$seqs,
method = “prob”)

logo_plot(foreground_seqs = pep.df$seqs,
method = “EDLogo”)

generate kmers

kmer <- generate_kmers(fa_file = “sac_cds.fa”,
fa_type = “dna”,
kmer_length = 11,
translate = T)

plot

logo_plot(foreground_seqs = pep.df$seqs,
background_seqs = kmer,
method = “enrich”)

三肽基序暂停得分
这个和上面的异曲同工,计算连续三个氨基酸的核糖体密度,比较不同样本间的差异。
原文:

multi_peptide_occupancy 计算多个连续氨基酸的核糖体密度:
peptide_df <- multi_peptide_occupancy(object = obj0,
cds_fa = “./sac_cds.fa”,
do_offset_correct = T,
peptide_length = 3)

pdf <- peptide_df$wider_format

head(pdf)

# A tibble: 6 × 5

pep_seq sgeIF5A-rep1 sgeIF5A-rep2 wt-rep1 wt-rep2

1 AAA 7.05 8.19 8.86 10.2

2 AAD 10.5 12.2 11.8 13.7

3 AAE 9.81 10.8 11.4 12.6

4 AAF 9.21 9.90 9.33 12.7

5 AAG 9.75 9.73 9.67 11.3

6 AAI 8.70 9.24 10.1 12.0

peptide_scatter_plot 绘图:
r1 <- peptide_scatter_plot(data = pdf,x = “wt-rep1”,y = “sgeIF5A-rep1”,
mark_motif = c(“DDP”,”GPP”,”DPP”,”PPP”,”PDK”,”PDG”,”PDP”,”DNP”,”DGP”,”SPP”))
r2 <- peptide_scatter_plot(data = pdf,x = “wt-rep2”,y = “sgeIF5A-rep2”,
mark_motif = c(“DDP”,”GPP”,”DPP”,”PPP”,”PDK”,”PDG”,”PDP”,”DNP”,”DGP”,”SPP”))

library(patchwork)

r1 + r2

结尾
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊老俊俊生信交流群(微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。

声明:来自老俊俊的生信笔记,仅代表创作者观点。链接:https://eyangzhen.com/1940.html

老俊俊的生信笔记的头像老俊俊的生信笔记

相关推荐

关注我们
关注我们
购买服务
购买服务
返回顶部