install.packages(“devtools”)

devtools::install_github(“junjunlab/RiboProfiler”)

or

remotes::install_github(“junjunlab/RiboProfiler”)

library(RiboProfiler)
使用

下载基因组和注释文件：
wget https://ftp.ensembl.org/pub/release-112/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_ceree.R64-1-1.dna.toplevel.fa.gz
wget https://ftp.ensembl.org/pub/release-112/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.112.gtf.gz
pre_longest_trans_info
提取最长转录本信息：
pre_longest_trans_info(gtf_file = “Saccharomyces_cerevisiae.R64-1-1.112.gtf”,
out_file = “longest_info.txt”)
读取进来看看：
df <- read.delim(“longest_info.txt”,header = F)

add colnames

colnames(df) <- c(“id”,”gene_name”,”gene_id”,”trans_id”,”chrom”,”strand”,
“cds_rg”,”exon_rg”,”5UTR_length”,”CDS_length”,”3UTR_length”)

❝
包括了 utr，cds 的长度，外显子位置等等信息。因为使用的是酵母物种，可以发现大部分基因是没有 utr 区域的。如果有时候分析可能需要对这样的情况进行拓展。这时候我们可以使用 gene_anno_extend 函数对 CDS 两端进行拓展，来增加 UTR 区域的长度，你可以拓展到指定长度，也可以增加一定的长度。
gene_anno_extend
函数参数：
gene_anno_extend <- function(longest_trans_file = NULL,
upstream_scale = NULL,
upstream_extend = NULL,
downstream_scale = NULL,
downstream_extend = NULL,
output_file = NULL)
对 CDS 上游增加 50nt，注意：对应的外显子区域的位置也一起变化了：
df_extend <- gene_anno_extend(longest_trans_file = “longest_info.txt”,
upstream_extend = 50)

一些有 utr 的基因就是增加了这些拓展的长度：

对上游拓展到 50nt（所有 utr 都增加至 50nt），下游拓展 100nt：
df_extend <- gene_anno_extend(longest_trans_file = “longest_info.txt”,
upstream_scale = 50,
downstream_extend = 100)

FetchSequence
❝
有了最长转录本信息，接着就可以提取序列了。
FetchSequence <- function(gene_file = NULL,
genome_file = NULL,
output_file = NULL,
type = c(“cds”,”exon”),
coding_type = c(“NT”,”AA”),
pythonPath = NULL,
table = 1)
提取开放阅读框的序列：
FetchSequence(gene_file = “longest_info.txt”,
genome_file = “./Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa”,
output_file = “sac_cds.fa”,
type = “cds”,
coding_type = “NT”)

提取整个转录本序列：
FetchSequence(gene_file = “longest_info.txt”,
genome_file = “./Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa”,
output_file = “sac_cds_aa.fa”,
type = “cds”,
coding_type = “AA”,
table = 1)

结尾

❝
路漫漫其修远兮,吾将上下而求索。
欢迎加入生信交流群。加我微信我也拉你进微信群聊老俊俊生信交流群 (微信交流群需收取 20 元入群费用,一旦交费,拒不退还!(防止骗子和便于管理)) 。QQ 群可免费加入, 记得进群按格式修改备注哦。

声明：文中观点不代表本站立场。本文传送门：https://eyangzhen.com/420418.html

提取最长转录本信息+exon 序列+CDS 序列+氨基酸序列

install.packages(“devtools”)

or

add colnames

作者专栏

老俊俊的生信笔记