引言
还是用这篇 Molecular Cell 文章酵母的 EIF5A 的文章数据。关于暂停位点得分的计算参考这篇 RNA 期刊的文章 《PausePred and Rfeet: webtools for inferring ribosome pauses and visualizing footprint density from ribosome profiling data》。相关的方法整合进 riboTransVis 包里。
数据文章:
计算方法文章:
公式:
Si=n2ri∑i=1nri+∑i=n23n2ri∑i=1nri∑i=n23n2ri.
具体介绍可以参考这篇 寻找核糖体暂停位点。
安装
install.packages(“devtools”)
devtools::install_github(“junjunlab/riboTransVis”)
or
remotes::install_github(“junjunlab/riboTransVis”)
构建 ribotrans 对象
使用比对好的 bam 文件作为输入,构建对象,这里是两个敲减 EIF5A 和两个野生型的四个样本:
sp <- c(“wt-rep1″,”wt-rep2″,”sgeIF5A-rep1″,”sgeIF5A-rep2”)
gp <- c(“wt”,”wt”,”sgeIF5A”,”sgeIF5A”)
ribobams <- c(“WT.1.sorted.bam”,”WT.2.sorted.bam”,”eIF5Ad.1.sorted.bam”,”eIF5Ad.2.sorted.bam”)
obj0 <- construct_ribotrans(genome_file = “../../index-data/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa”,
gtf_file = “../../index-data/Saccharomyces_cerevisiae.R64-1-1.112.gtf”,
mapping_type = “genome”,
assignment_mode = “end5”,
extend = TRUE,
extend_upstream = 50,
extend_downstream = 50,
Ribo_bam_file = ribobams,
Ribo_sample_name = sp,
RNA_sample_group = gp,
choose_longest_trans = T)
generate summary data for QC or other analysis
obj0 <- generate_summary(object = obj0, exp_type = “ribo”, nThreads = 40)
head(obj0@summary_info)
rname pos qwidth count sample sample_group mstart mstop translen
1 YAL067C_mRNA|SEO1 1774 27 1 wt-rep1 wt-rep1 51 1829 1882
2 YAL067C_mRNA|SEO1 1771 27 1 wt-rep1 wt-rep1 51 1829 1882
3 YAL067C_mRNA|SEO1 1766 20 1 wt-rep1 wt-rep1 51 1829 1882
4 YAL067C_mRNA|SEO1 1761 27 1 wt-rep1 wt-rep1 51 1829 1882
5 YAL067C_mRNA|SEO1 1662 28 1 wt-rep1 wt-rep1 51 1829 1882
6 YAL067C_mRNA|SEO1 1632 25 1 wt-rep1 wt-rep1 51 1829 1882
一些质控:
library(ggplot2)
qc
length_plot(obj0)
metagene_plot(object = obj0,
mode = “codon”,
rel2st_dist = c(-50, 900))
矫正偏移
这里使用-12nt,便宜到 P 位点(EPA):
offset <- rbind(data.frame(list(sample = “sgeIF5A-rep1”,
qwidth = 25:32,
rel_pos = -12)),
data.frame(list(sample = “sgeIF5A-rep2”,
qwidth = 25:32,
rel_pos = -12)),
data.frame(list(sample = “wt-rep1”,
qwidth = 25:32,
rel_pos = -12)),
data.frame(list(sample = “wt-rep2”,
qwidth = 25:32,
rel_pos = -12))
)
obj0@reads_offset_info <- offset
计算暂停得分
get_pausing_sites 函数来计算,这里选择 replicate 1 样本来计算:
pause site
get pause site
ps <- get_pausing_sites(object = obj0,
do_offset_correct = T,
sample_selected = c(“wt-rep1″,”sgeIF5A-rep1”))
check
head(ps)
# A tibble: 6 × 16
# Groups: rname, sample [1]
rname relst sample normsm total_counts sum_1 sum_2 pause_score total_sum min_sum max_sum balance_score imbalance_ratio pause_mean
1 Q0250… 1 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40
2 Q0250… 2 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40
3 Q0250… 3 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40
4 Q0250… 4 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40
5 Q0250… 5 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40
6 Q0250… 6 sgeIF… 0 0 56.2 174. 0 230. 56.2 174. 0.244 3.1 1.40
# ℹ 2 more variables: pause_sd , z_score
plot
ps.ft <- ps %>%
dplyr::filter(pause_score > 20 & z_score > 0)
check
head(ps.ft)
# A tibble: 6 × 16
# Groups: rname, sample [1]
rname relst sample normsm total_counts sum_1 sum_2 pause_score total_sum min_sum max_sum balance_score imbalance_ratio pause_mean
1 Q0250… 559 sgeIF… 5.62 134 191. 28.1 22.9 219. 28.1 191. 0.128 6.8 1.40
2 Q0250… 562 sgeIF… 16.9 134 185. 28.1 69.1 214. 28.1 185. 0.132 6.6 1.40
3 Q0250… 568 sgeIF… 11.2 268 169. 28.1 46.7 197. 28.1 169. 0.143 6 1.40
4 Q0250… 588 sgeIF… 11.2 268 157. 22.5 57.1 180. 22.5 157. 0.125 7 1.40
5 Q0250… 591 sgeIF… 5.62 134 146. 16.9 37.2 163. 16.9 146. 0.103 8.67 1.40
6 Q0250… 593 sgeIF… 16.9 268 140. 16.9 112 157. 16.9 140. 0.107 8.33 1.40
# ℹ 2 more variables: pause_sd , z_score
原文展示了 3 个基因发生了明显的暂停:
我们复现这几个看看,然后把 pause_score > 20 的位置也标出来:
library(ggplot2)
examples
ps.ft.pt <- ps %>%
dplyr::filter(rname %in% c(“YGL206C_mRNA|CHC1″,”YER063W_mRNA|THO1”,
“YER059W_mRNA|PCL6”))
id <- unique(ps.ft.pt$rname)
loop plot
x = 1
lapply(seq_along(id),function(x){
tmp <- subset(ps.ft.pt, rname == id[x]) tmp.ps <- subset(ps.ft.pt, rname == id[x]) %>%
dplyr::filter(pause_score > 20 & z_score > 0)
ggplot(tmp) +
geom_col(aes(x = relst,y = normsm),fill = “#1A2A80”,color = “#1A2A80”) +
geom_point(data = tmp.ps,aes(x = relst, y = 0),
color = “black”,fill = “#FFD700”,
shape = 21,size = 3) +
# facet_wrap(~sample,scales = “free”) +
ggh4x::facet_grid2(sample~rname,scales = “fixed”) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour = “black”),
aspect.ratio = 0.6,
strip.text = element_text(face = “bold”,size = rel(1))) +
xlab(“Position along CDS (nt)”) +
ylab(“Ribosome density”)
}) -> p
cowplot::plot_grid(plotlist = p,nrow = 1)
可以看到位点还是很符合的。再挑几个看看:
examples
ps.ft.pt <- ps %>%
dplyr::filter(rname %in% c(“YAL001C_mRNA|TFC3″,”YGR097W_mRNA|ASK10”,
“YLR339C_mRNA|YLR339C_mRNA”,”YDR154C_mRNA|YDR154C_mRNA”))
id <- unique(ps.ft.pt$rname)
loop plot
x = 1
lapply(seq_along(id),function(x){
tmp <- subset(ps.ft.pt, rname == id[x]) tmp.ps <- subset(ps.ft.pt, rname == id[x]) %>%
dplyr::filter(pause_score > 20)
ggplot(tmp) +
geom_col(aes(x = relst,y = normsm),fill = “#1A2A80”,color = “#1A2A80”) +
geom_point(data = tmp.ps,aes(x = relst, y = 0),
color = “black”,fill = “#FFD700”,
shape = 21,size = 3) +
# facet_wrap(~sample,scales = “free”) +
ggh4x::facet_grid2(sample~rname,scales = “fixed”) +
theme(panel.grid = element_blank(),
axis.text = element_text(colour = “black”),
aspect.ratio = 0.6,
strip.text = element_text(face = “bold”,size = rel(1))) +
xlab(“Position along CDS (nt)”) +
ylab(“Ribosome density”)
}) -> p
cowplot::plot_grid(plotlist = p,nrow = 1)
暂停基序分析
暂停位点往往和附件序列的特征有关,比如不同极性的氨基酸性质,我们可以提取暂停位点附件的基序来看附件的序列特征。
这里我们提取每个基因前 3 的暂停位点,然后与野生型比较,筛选敲低 EIF5A 上调的暂停位点信息:
long to wide
ps.wd <- ps %>%
dplyr::select(rname,relst,sample,z_score,pause_score) %>%
tidyr::pivot_wider(names_from = sample,values_from = c(z_score,pause_score))
filter top pause site for each gene and up regulated sites
ps.wd.ft <- ps.wd %>%
dplyr::filter(pause_score_sgeIF5A-rep1
> 0 & pause_score_wt-rep1
> 0) %>%
dplyr::mutate(ps_ratio = log2(pause_score_sgeIF5A-rep1
/pause_score_wt-rep1
),
zs_ratio = log2(z_score_sgeIF5A-rep1
/z_score_wt-rep1
)) %>%
dplyr::filter(ps_ratio > 1 & zs_ratio > 1) %>%
dplyr::ungroup() %>%
dplyr::slice_max(order_by = ps_ratio,n = 3,by = rname)
check
head(ps.wd.ft)
# A tibble: 6 × 8
rname relst z_score_sgeIF5A-rep1
z_score_wt-rep1
pause_score_sgeIF5A-rep…¹ pause_score_wt-rep1
ps_ratio zs_ratio
1 YAL001C_mRNA|TFC3 1575 18.1 1.70 103. 6.58 3.96 3.42
2 YAL001C_mRNA|TFC3 21 13.3 1.28 75.4 5.24 3.85 3.37
3 YAL001C_mRNA|TFC3 1758 8.49 0.895 48.7 3.99 3.61 3.25
4 YAL002W_mRNA|VPS8 144 3.57 0.402 18.6 2.76 2.75 3.15
5 YAL002W_mRNA|VPS8 2076 10.6 1.87 53 9 2.56 2.51
6 YAL002W_mRNA|VPS8 729 6.66 1.36 33.7 6.83 2.30 2.29
# ℹ abbreviated name: ¹pause_score_sgeIF5A-rep1
get_codon_from_site 函数可以从上面筛选的暂停位点信息,提取周围的氨基酸序列,left_extend 和 right_extend 参数指定向上下游延伸几个氨基酸:
get codons around P-sie
pep.df <- get_codon_from_site(object = obj0,
cds_fa = “sac_cds.fa”,
pause_data = ps.wd.ft,
left_extend = 5,
right_extend = 5)
check
head(pep.df)
rname codon_pos lp rp len seqs
1 YAL001C_mRNA|TFC3 526 521 531 1161 KSAEDSPSSNG
2 YAL001C_mRNA|TFC3 8 3 13 1161 LTIYPDELVQI
3 YAL001C_mRNA|TFC3 587 582 592 1161 KYMGSTTTLDK
4 YAL002W_mRNA|VPS8 49 44 54 1275 YTAPPLNEDGP
5 YAL002W_mRNA|VPS8 693 688 698 1275 GFSIKWPSNSN
6 YAL002W_mRNA|VPS8 244 239 249 1275 KILDVNSSKEK
绘制 motif 特征,位置 5-6-7 对应 EPA 位点:
motif plot
logo_plot(foreground_seqs = pep.df$seqs,
method = “bits”)
logo_plot(foreground_seqs = pep.df$seqs,
method = “prob”)
logo_plot(foreground_seqs = pep.df$seqs,
method = “EDLogo”)
generate kmers
kmer <- generate_kmers(fa_file = “sac_cds.fa”,
fa_type = “dna”,
kmer_length = 11,
translate = T)
plot
logo_plot(foreground_seqs = pep.df$seqs,
background_seqs = kmer,
method = “enrich”)
三肽基序暂停得分
这个和上面的异曲同工,计算连续三个氨基酸的核糖体密度,比较不同样本间的差异。
原文:
multi_peptide_occupancy 计算多个连续氨基酸的核糖体密度:
peptide_df <- multi_peptide_occupancy(object = obj0,
cds_fa = “./sac_cds.fa”,
do_offset_correct = T,
peptide_length = 3)
pdf <- peptide_df$wider_format
head(pdf)
# A tibble: 6 × 5
pep_seq sgeIF5A-rep1
sgeIF5A-rep2
wt-rep1
wt-rep2
1 AAA 7.05 8.19 8.86 10.2
2 AAD 10.5 12.2 11.8 13.7
3 AAE 9.81 10.8 11.4 12.6
4 AAF 9.21 9.90 9.33 12.7
5 AAG 9.75 9.73 9.67 11.3
6 AAI 8.70 9.24 10.1 12.0
peptide_scatter_plot 绘图:
r1 <- peptide_scatter_plot(data = pdf,x = “wt-rep1”,y = “sgeIF5A-rep1”,
mark_motif = c(“DDP”,”GPP”,”DPP”,”PPP”,”PDK”,”PDG”,”PDP”,”DNP”,”DGP”,”SPP”))
r2 <- peptide_scatter_plot(data = pdf,x = “wt-rep2”,y = “sgeIF5A-rep2”,
mark_motif = c(“DDP”,”GPP”,”DPP”,”PPP”,”PDK”,”PDG”,”PDP”,”DNP”,”DGP”,”SPP”))
library(patchwork)
r1 + r2
结尾
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊老俊俊生信交流群(微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。
声明:来自老俊俊的生信笔记,仅代表创作者观点。链接:https://eyangzhen.com/1940.html