ncRNA-annotation


1.conda 环境配置

# conda env
conda create -n ncRNA
conda activate ncRNA

# download infernal
conda install -c bioconda infernal

2.注释 DB 配置

#  download Rfam data
cd /data/chaofan/source
mkdir 04.ncRNA && cd 04.ncRNA
wget ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz
gzip -d Rfam.cm.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.clanin

# cmpress 索引
cmpress Rfam.cm

# 从https://rfam.org/search/type下载相应RF-number对应的注释
# 地址为: /data/chaofan/source/04.ncRNA/RF.txt

3.开始注释 ncRNA

# 开始注释
cd /data/chaofan/projects/13.Alternaria_Z7/08.gaisen_genome/l0
cmscan -Z 66 --cut_ga --rfam --nohmmonly --tblout gaisen.asm.primer.tblout  \
  --fmt 2 --cpu 40 --clanin /data/chaofan/source/04.ncRNA/Rfam.clanin \
  /data/chaofan/source/04.ncRNA/Rfam.cm gaisen.asm.primer.fa > gaisen.asm.primer.cmscan

cd /data/chaofan/projects/13.Alternaria_Z7/08.gaisen_genome/nextDenovo
cmscan -Z 66 --cut_ga --rfam --nohmmonly --tblout nextDevono.tblout  \
  --fmt 2 --cpu 40 --clanin /data/chaofan/source/04.ncRNA/Rfam.clanin \
  /data/chaofan/source/04.ncRNA/Rfam.cm nextDevono.fa > nextDevono.cmscan

# -Z:根据基因组大小来定,基因组大小的2倍,Mb单位,选一个整数
# –tblout 指定table格式输出文件

4.注释结果整理

保留非重叠区及重叠区最佳 hit(可以不做)

awk 'BEGIN{OFS="\t";}{if(FNR==1) print "target_name\taccession\tquery_name\tquery_start\tquery_end\tstrand\tscore\tEvalue"; if(FNR>2 && $20!="=" && $0!~/^#/) print $2,$3,$4,$10,$11,$12,$17,$18; }' nextDevono.tblout > nextDevono.tblout.xls

注释结果整理为 gff 格式, 有些具体的细节我按照自己喜欢的格式进行生成,也可以进行调整。

python /data/chaofan/scripts/infernal-tblout2gff.py nextDevono.tblout nextDevono.infernal.ncRNA.gff3 /data/chaofan/source/04.ncRNA/RF.txt
python /data/chaofan/scripts/infernal-tblout2gff.py gaisen.asm.primer.tblout gaisen.asm.primer.infernal.ncRNA.gff3 /data/chaofan/source/04.ncRNA/RF.txt

5.所用脚本

infernal-tblout2gff.py
#!/data/chaofan/software/miniconda/bin/python from sys import argv, exit import os if __name__ == '__main__': if len(argv) != 4: exit(f"Usage: python {argv[0]} [infernal-tblout] [ouf_gff] [RF.txt]") ouf_lines = [] if not os.path.exists(argv[1]): exit("Input file not exists!") if not os.path.exists(argv[3]): exit("RF-number info not exists!") RF_dic = {} for line in open(argv[3]): tmp = line.strip().split("\t") if tmp[2].startswith("Gene;"): RF_dic[tmp[0]] = tmp[2].split(';')[1].strip() elif ";" in tmp[2]: RF_dic[tmp[0]] = tmp[2].split(';')[0].strip() else: RF_dic[tmp[0]] = tmp[2] for line in open(argv[1]): if line[0] in ['#', '\n']: continue tmp = line.strip().split() if tmp[19] == '=': continue start_end = list(map(str,sorted([int(tmp[9]),int(tmp[10])]))) t = [tmp[3], "cmscan", RF_dic[tmp[2]], start_end[0], start_end[1], \ tmp[17], tmp[11], '.', "Description="+tmp[1]+":"+"_".join(tmp[26:])] ouf_lines.append(t) ouf_lines = sorted(ouf_lines, key=lambda s:(s[0],int(s[3]))) t_dic = {} # add_id for order in range(len(ouf_lines)): if ouf_lines[order][2] not in t_dic: t_dic[ouf_lines[order][2]] = 0 else: t_dic[ouf_lines[order][2]] += 1 ouf_lines[order][-1] = "ID="+ouf_lines[order][2]+"_"+str(t_dic[ouf_lines[order][2]]).zfill(6)+\ "; "+ouf_lines[order][-1] ouf_w = open(argv[2], 'w') ouf_w.write("\n".join(["\t".join(t_l) for t_l in ouf_lines])) ouf_w.close()

    如发现任何问题,恳请联系我纠正。


文章作者: zhangchaofan
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 zhangchaofan !
评论
  目录