软件安装
#Installing NCBI-blast+ (ftp://ftp.ncbi.nih.gov/blast/executables/blast+/ | https://blast.ncbi.nlm.nih.gov/Blast.cgi)
tar zxf ~/software/rmblast-2.14.0+-x64-linux.tar.gz -C /opt/biosoft
ln -s /opt/biosoft/rmblast-2.14.0 /opt/biosoft/ncbi-blast-2.14.0+
echo 'PATH=$PATH:/opt/biosoft/ncbi-blast-2.14.0+/bin' >> ~/.bashrc
source ~/.bashrc
echo "[BLAST]
BLASTDB=/opt/biosoft/bioinfomatics_databases/ncbi_blastdb" > ~/.ncbirc
# Installing DIAMOND (https://github.com/bbuchfink/diamond)
#wget https://github.com/bbuchfink/diamond/archive/refs/tags/v2.1.8.tar.gz -O ~/software/diamond-2.1.8.tar.gz
#wget https://github.com/bbuchfink/diamond/releases/download/v2.1.8/diamond-linux64.tar.gz -P ~/software/
tar zxf ~/software/diamond-2.1.8.tar.gz && cd diamond-2.1.8
cmake ./ -DCMAKE_INSTALL_PREFIX=/opt/biosoft/diamond-2.1.8 && make -j 4 && make install
cd .. && rm -rf diamond-2.1.8
echo 'PATH=$PATH:/opt/biosoft/diamond-2.1.8/bin' >> ~/.bashrc
source ~/.bashrc
# Installing Nr 数据库子集数据库(数据库的版本是其下载日期)
mkdir /opt/biosoft/bioinfomatics_databases/Nr/
cd /opt/biosoft/bioinfomatics_databases/Nr/
# 下载Nr数据库(FASTA文件)
ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /blast/db/FASTA/nr.gz ./
wget https://ftp.ncbi.nih.gov/blast/db/FASTA/nr.gz.md5
# 下载NCBI的分类数据库文件
ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /pub/taxonomy/taxdump.tar.gz ./
ascp -T -l 200M -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --host=ftp.ncbi.nih.gov --user=anonftp --mode=recv /pub/taxonomy/accession2taxid/prot.accession2taxid.gz ./
# 下载并安装NCBI分类数据库解析软件TaxonKit
wget https://github.com/shenwei356/taxonkit/releases/download/v0.2.4/taxonkit_linux_amd64.tar.gz
tar zxvf taxonkit_linux_amd64.tar.gz
#提取想要的指定大类物种序列
mkdir ~/.taxonkit
tar zxf taxdump.tar.gz -C ~/.taxonkit
# 其主要有效文件有两个:
# names.dmp 记录物种名及其分类编号
# nodes.dmp 记录分类编号的节点信息
# 提取古菌(2157)、细菌(2)和病毒(10239)这几个大类对应的所有分类编号。
# 查看~/.taxonkit/names.dmp文件,使用关键词检索得到目标类的分类编号,例如:
# fungi 4751 # grep -P "\|\s+[fF]ungi\w*\s*\|" ~/.taxonkit/names.dmp
# plants 3193 # grep -P "\|\s+[pP]lant\w*\s*\|" ~/.taxonkit/names.dmp
# animals 33208 # grep -P "\|\s+[aA]nimal\w*\s*\|" ~/.taxonkit/names.dmp
# 提取包含古菌(2157)、细菌(2)和病毒(10239)的子集
# 使用taxonkit命令解析nodes.dmp文件的物种节点信息,得到指定类的所有物种列表信息;再编写程序extract_sub_data_from_Nr.pl获得列表中物种在Nr数据库中的序列信息。
./taxonkit list -j 8 --ids 2,2157,10239 > sub.meta.list
./taxonkit list -j 8 --ids 4751 > sub.fungi.list
./taxonkit list -j 8 --ids 3193 > sub.plants.list
./taxonkit list -j 8 --ids 33208 > sub.animals.list
gzip -dc prot.accession2taxid.gz > prot.accession2taxid
gzip -dc nr.gz > nr.fasta
# 提取fungi/plants/animals子集,注意extract_sub_data_from_Nr.pl程序需要消耗约150GB内存。
extract_sub_data_from_Nr.pl --sub_taxon sub.meta.list --acc2taxid prot.accession2taxid nr.fasta > nr_meta.fasta
extract_sub_data_from_Nr.pl --sub_taxon sub.fungi.list --acc2taxid prot.accession2taxid nr.fasta > nr_fungi.fasta
extract_sub_data_from_Nr.pl --sub_taxon sub.plants.list --acc2taxid prot.accession2taxid nr.fasta > nr_plants.fasta
extract_sub_data_from_Nr.pl --sub_taxon sub.animals.list --acc2taxid prot.accession2taxid nr.fasta > nr_animals.fasta
cat nr_animals.fasta nr_plants.fasta nr_fungi.fasta > nr_eukaryon.fasta
# 使用diamond makedb创建数据库
diamond makedb --db nr_meta --in nr_meta.fasta &> nr_meta.log
diamond makedb --db nr_fungi --in nr_fungi.fasta &> nr_fungi.log
diamond makedb --db nr_plants --in nr_plants.fasta &> nr_plants.log
diamond makedb --db nr_animals --in nr_animals.fasta &> nr_animals.log
diamond makedb --db nr_eukaryon --in nr_eukaryon.fasta &> nr_eukaryon.log
# 删除中间文件
/bin/rm -rf *.fasta *.log *.list prot.accession2taxid taxonkit ~/.taxonkit
# Installing Uniprot Swiss-Prot database (https://www.uniprot.org/downloads)
#wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz -P ~/software/
mkdir -p /opt/biosoft/bioinfomatics_databases/Swiss-Prot
cd /opt/biosoft/bioinfomatics_databases/Swiss-Prot
gzip -dc ~/software/uniprot_sprot.fasta.gz > uniprot_sprot.fasta
makeblastdb -in uniprot_sprot.fasta -dbtype prot -title uniprot_sprot -parse_seqids -out uniprot_sprot -logfile uniprot_sprot.log
diamond makedb --db uniprot_sprot --in uniprot_sprot.fasta &> uniprot_sprot.diamond_log
rm uniprot_sprot.fasta
# Installing COG/KOG database (https://www.ncbi.nlm.nih.gov/COG/)
#wget ftp://ftp.ncbi.nih.gov/pub/COG/COG/myva -P ~/software/
#wget ftp://ftp.ncbi.nih.gov/pub/COG/KOG/kyva -P ~/software/
mkdir /opt/biosoft/bioinfomatics_databases/COG
cd /opt/biosoft/bioinfomatics_databases/COG
cp ~/software/COG/* .
ln -s myva cog.fasta
ln -s kyva kog.fasta
diamond makedb --db cog --in cog.fasta &> cog.log
diamond makedb --db kog --in kog.fasta &> kog.log
# Installing eggnog-mapper (https://github.com/eggnogdb/eggnog-mapper | http://eggnogdb.embl.de)
#wget https://github.com/eggnogdb/eggnog-mapper/archive/refs/tags/2.1.11.tar.gz -O ~/software/eggnog-mapper-2.1.11.tar.gz
tar zxf ~/software/eggnog-mapper-2.1.11.tar.gz -C /opt/biosoft
cd /opt/biosoft/eggnog-mapper-2.1.11/
pip3 install psutil xlsxwriter biopython
echo 'PATH=$PATH:/opt/biosoft/eggnog-mapper-2.1.11' >> ~/.bashrc
source ~/.bashrc
# 下载eggNOG数据库后,才能正常使用eggnog-mapper软件
#./download_eggnog_data.py
# Installing eggNOG Eukaryota (taxon ID: 2759) database
#mkdir /opt/biosoft/bioinfomatics_databases/EggNOG
#cd /opt/biosoft/bioinfomatics_databases/EggNOG
#wget http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2759/2759_hmms.tar
#tar xf 2759_hmms.tar
#cat 2759/*.hmm.gz > EggNOG_Eukaryota.hmm
#hmmpress EggNOG_Eukaryota.hmm
#rm -rf 2759 EggNOG_Eukaryota.hmm
# Installing hmmer (http://hmmer.org/)
#wget http://eddylab.org/software/hmmer/hmmer-3.3.2.tar.gz -P ~/software/
tar zxf ~/software/hmmer-3.3.2.tar.gz
cd hmmer-3.3.2/
./configure --prefix=/opt/biosoft/hmmer-3.3.2 && make -j 4 && make install
cd .. && rm -rf hmmer-3.3.2
echo 'PATH=/opt/biosoft/hmmer-3.3.2/bin/:$PATH' >> ~/.bashrc
source ~/.bashrc
# Installing Pfam database (http://pfam.xfam.org/ | ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam27.0/)
#wget http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz -P ~/software/
gzip -dc ~/software/Pfam-A.hmm.gz > /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-A.hmm
hmmpress /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-A.hmm
rm /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-A.hmm
#wegt ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam27.0/Pfam-A.hmm.gz -O ~/software/Pfam-A_V27.hmm.gz
#wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam27.0/Pfam-B.hmm.gz -O ~/software/Pfam-B_V27.hmm.gz
mkdir -p /opt/biosoft/bioinfomatics_databases/Pfam
gzip -dc ~/software/Pfam-A_V27.hmm.gz > /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
gzip -dc ~/software/Pfam-B_V27.hmm.gz >> /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
hmmpress /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
rm /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
# Installing go_class
tar zxf ~/software/go_class.tar.gz -C /opt/biosoft/
cd /opt/biosoft/go_class/bin/
./make_go_class_config.pl --goslim goslim_agr.obo go-basic.obo
echo 'PATH=$PATH:/opt/biosoft/go_class/bin/' >> ~/.bashrc
source ~/.bashrc
# 安装 dbCAN (http://bcb.unl.edu/dbCAN2/index.php)
#mkdir -p /opt/biosoft/bioinfomatics_databases/dbCAN_V11
#cd /opt/biosoft/bioinfomatics_databases/dbCAN_V11
#wget https://bcb.unl.edu/dbCAN2/download/Databases/V11/dbCAN-HMMdb-V11.txt
#wget https://bcb.unl.edu/dbCAN2/download/Databases/V11/CAZyDB.08062022.fa
#wget https://bcb.unl.edu/dbCAN2/download/Databases/V11/hmmscan-parser.sh
#wget https://bcb.unl.edu/dbCAN2/download/Databases/V11/readme.txt
tar zxf ~/software/dbCAN_V11.tar.gz -C /opt/biosoft/bioinfomatics_databases/
cd /opt/biosoft/bioinfomatics_databases/dbCAN_v11
ln -s dbCAN-HMMdb-V11.txt dbCAN-HMMdb
hmmpress dbCAN-HMMdb
ln -s CAZyDB.08062022.fa CAZyDB.fa
diamond makedb --in CAZyDB.fa --db CAZyDB
########## 计算阈值 ##########
#split_fasta_file_averagely.pl CAZyDB.fa 10
#for i in `ls ??.fasta`
#do
# x=${i/.fasta/}
# echo "diamond blastp --db CAZyDB --query $i --out $x.xml --outfmt 5 --sensitive --max-target-seqs 501 --evalue 1e-5 --id 10 --block-size 20.0 --tmpdir /dev/shm --index-chunks 1; parsing_blast_result.pl --max-hit-num 500 $x.xml > $x.tab"
#done > command.diamond.list
#ParaFly -c command.diamond.list -CPU 1
#perl -e 'while (<>) { if (m/^(\S+)\t(\S+)/ && $1 eq $2) { next } else { print } }' ??.tab > diamond.tab
#
#para_hmmscan.pl --CPU 80 --no_cut_ga --hmm_db dbCAN-HMMdb CAZyDB.fa > hmmscan.domtbl
#dbcan_combine.pl --threshold_fractile 0.5 --out_prefix out50 --query CAZyDB.fa --CAZy_blastDB CAZyDB.fa hmmscan.domtbl diamond.tab
#
#rm -rf *.fasta *.xml command* hmmscan* ?.tab
##############################
# Installing SignalP (http://www.cbs.dtu.dk/services/SignalP/ | http://www.cbs.dtu.dk/services/software.php)
# 需要填写edu邮箱和相关信息来获取下载地址
#wget http://www.cbs.dtu.dk/download/6B91F6BC-5A05-11E9-8172-2ED6B9CD16B5/signalp-5.0.Linux.tar.gz -P ~/software/
tar zxf /home/train/software/signalp-5.0.Linux.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/signalp-5.0/bin' >> ~/.bashrc
source ~/.bashrc
# Installing TMHMM (http://www.cbs.dtu.dk/services/TMHMM/)
# 需要填写edu邮箱和相关信息来获取下载地址
#wget http://www.cbs.dtu.dk/download/2435F062-5A0E-11E9-8117-6AA2B9CD16B5/tmhmm-2.0c.Linux.tar.gz -P ~/software/
tar zxf /home/train/software/tmhmm-2.0c.Linux.tar.gz -C /opt/biosoft/
perl -p -i -e 's#/usr/local/bin/perl#/usr/bin/perl#' /opt/biosoft/tmhmm-2.0c/bin/tmhmm*
echo 'PATH=$PATH:/opt/biosoft/tmhmm-2.0c/bin/' >> ~/.bashrc
source ~/.bashrc
# 1. NR 注释(基于序列相似性)
# 本地化基因组数据库构建和wwwblast搭建
mkdir /opt/biosoft/bioinfomatics_databases/ncbi_blastdb
cd /opt/biosoft/bioinfomatics_databases/ncbi_blastdb
# 构建核酸数据库
makeblastdb -in ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta -dbtype nucl -title Malassezia_sympodialis_V01.genome -parse_seqids -out Malassezia_sympodialis_V01.genome -logfile Malassezia_sympodialis_V01.genome.log
# 构建蛋白质数据库
makeblastdb -in ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Malassezia_sympodialis_V01.protein.fasta -dbtype prot -title Malassezia_sympodialis_V01.protein -parse_seqids -out Malassezia_sympodialis_V01.protein -logfile protein.log
# 本地化运行 blast 程序
mkdir -p ~/13.functional_annotation/00.blast_to_genome
cd ~/13.functional_annotation/00.blast_to_genome
# 在https://www.ncbi.nlm.nih.gov/gene/数据库搜索"3-phytase a",点击Send to,选择File,在Format中选择UI List,下载文件保存为gi.ncbi_gene_phyA.list
cp ~/00.incipient_data/data_for_functional_annotation/gi.ncbi_gene_phyA.list ./
#安装模块sudo cpan -i Bio::DB::GenBank
ncbi_acc_or_gi_to_fasta.pl gi.ncbi_gene_phyA.list 10 > phyA.nucl.fasta#通过UI List的gi编号去ncbi转化为fasta序列,失败一般是网络的问题
#由于下载速度太慢,cp示例数据,并进行修改id
[train@MiWiFi-R3P-srv 00.blast_to_genome]$ grep ">" ~/00.incipient_data/data_for_functional_annotation/phyA.nucl.fasta | head
>35405752 | AACN010663228 | Canis lupus familiaris breed poodle ctg19866850009989, whole genome shotgun sequence.
>36555922 | AACN011019018 | Canis lupus familiaris breed poodle ctg19866851713295, whole genome shotgun sequence.
>19247313 | BJ338951 | BJ338951 Dictyostelium discoideum cDNA library, AF Dictyostelium discoideum cDNA clone dda63i18 5', mRNA sequence.
>20376086 | BM401458 | JH1D04F Snake Bothrops insularis library IL3 Bothrops insularis cDNA 5' similar to Snake venom metalloproteinase, mRNA sequence.
>31002700 | CD242236 | AGENCOURT_14122406 NIH_MGC_187 Homo sapiens cDNA clone IMAGE:30382204 5', mRNA sequence.
>39616953 | CG917667 | ZMMBBb0388F15.r ZMMBBb Zea mays subsp. mays genomic clone ZMMBBb0388F15 3', genomic survey sequence.
>25974636 | BU481059 | 603845202F1 CSEQRBN22 Gallus gallus cDNA clone ChEST833c1 5', mRNA sequence.
>12986094 | AZ816186 | 2M0084M15R Mouse 10kb plasmid UUGC1M library Mus musculus genomic clone UUGC2M0084M15 R, genomic survey sequence.
>28728610 | AABR02134075 | Rattus norvegicus strain BN/SsNHsdMCW chromosome 7 clone CH230-342M6; CH230-149B18 RNOR01119317, whole genome shotgun sequence.
>5444956 | AI824285 | wj08b12.x1 NCI_CGAP_Kid12 Homo sapiens cDNA clone IMAGE:2402207 3' similar to TR:O13642 O13642 LIPOIC ACID SYNTHETASE.; contains Alu repetitive element, mRNA sequence.
perl -p -e 's/>(\S+).*/>$1/; s/\|/_/g' ~/00.incipient_data/data_for_functional_annotation/phyA.nucl.fasta > phyA.nucl.fasta
[train@MiWiFi-R3P-srv 00.blast_to_genome]$ grep ">" phyA.nucl.fasta | head
>35405752
>36555922
>19247313
>20376086
>31002700
>39616953
>25974636
>12986094
>28728610
>5444956
[train@MiWiFi-R3P-srv 00.blast_to_genome]$ cal_seq_length.pl phyA.pep.fasta
sp_O00092_PHYA_ASPFU 465
sp_P34752_PHYA_ASPNG 467
sp_O00100_PHYA2_ASPTE 466
sp_O00085_PHYA1_ASPTE 466
sp_O00107_PHYA_THIHE 487
sp_P34753_PHYA_ASPAW 467
sp_Q9C1T1_PHYA_ASPOR 466
sp_Q0CLV1_PHYA_ASPTN 466
sp_D4ANW6_PHYA_ARTBC 456
blastn -query phyA.nucl.fasta -db Malassezia_sympodialis_V01.genome -out blastn.out -evalue 1e-5 -outfmt 7 -num_threads 4
#由于不用物种核酸序列比对结果比较差,需要使用蛋白序列进行比对更加准确
# 在https://www.uniprot.org/uniprot/搜索"3-phytase a" AND reviewed:yes,并下载数据
[train@MiWiFi-R3P-srv 00.blast_to_genome]$ grep ">" ~/00.incipient_data/data_for_functional_annotation/uniprot-_3-phytase+a_-filtered-reviewed_yes.fasta
>sp|O00092|PHYA_ASPFU 3-phytase A OS=Neosartorya fumigata (strain ATCC MYA-4609 / Af293 / CBS 101355 / FGSC A1100) OX=330879 GN=phyA PE=1 SV=1
>sp|P34752|PHYA_ASPNG 3-phytase A OS=Aspergillus niger OX=5061 GN=phyA PE=1 SV=1
>sp|O00100|PHYA2_ASPTE 3-phytase A OS=Aspergillus terreus OX=33178 GN=phyA PE=1 SV=1
>sp|O00085|PHYA1_ASPTE 3-phytase A OS=Aspergillus terreus OX=33178 GN=phyA PE=1 SV=1
>sp|O00107|PHYA_THIHE 3-phytase A OS=Thielavia heterothallica OX=78579 GN=PHYA PE=1 SV=1
>sp|P34753|PHYA_ASPAW 3-phytase A OS=Aspergillus awamori OX=105351 GN=phyA PE=3 SV=1
>sp|Q9C1T1|PHYA_ASPOR 3-phytase A OS=Aspergillus oryzae (strain ATCC 42149 / RIB 40) OX=510516 GN=phyA PE=2 SV=1
>sp|Q0CLV1|PHYA_ASPTN 3-phytase A OS=Aspergillus terreus (strain NIH 2624 / FGSC A1156) OX=341663 GN=phyA PE=3 SV=1
>sp|D4ANW6|PHYA_ARTBC 3-phytase A OS=Arthroderma benhamiae (strain ATCC MYA-4681 / CBS 112371) OX=663331 GN=ARB_05933 PE=1 SV=1
perl -p -e 's/>(\S+).*/>$1/; s/\|/_/g' ~/00.incipient_data/data_for_functional_annotation/uniprot-_3-phytase+a_-filtered-reviewed_yes.fasta > phyA.pep.fasta
[train@MiWiFi-R3P-srv 00.blast_to_genome]$ grep ">" phyA.pep.fasta
>sp_O00092_PHYA_ASPFU
>sp_P34752_PHYA_ASPNG
>sp_O00100_PHYA2_ASPTE
>sp_O00085_PHYA1_ASPTE
>sp_O00107_PHYA_THIHE
>sp_P34753_PHYA_ASPAW
>sp_Q9C1T1_PHYA_ASPOR
>sp_Q0CLV1_PHYA_ASPTN
>sp_D4ANW6_PHYA_ARTBC
blastp -query phyA.pep.fasta -db Malassezia_sympodialis_V01.protein -out blastp.out -evalue 1e-5 -outfmt 7 -num_threads 4 #建议 -outfmt 5
#最终结果应该同时考虑覆盖率、e-value和identity(尽量大于20%),但是ncbi-blast软件不提供覆盖率计算,需要编写程序计算覆盖率,本次培训提供实用脚本parsing_blast_result.pl
# 准备基因组蛋白质序列文件
mkdir -p ~/13.functional_annotation
cd ~/13.functional_annotation
perl -p -e 's/\*$//; s/^>(\w+).*/>$1/' ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Malassezia_sympodialis_V01.protein.fasta > proteins.fasta
# 对基因组蛋白质序列进行Nr注释(目前最大的数据库)
mkdir -p /home/train/13.functional_annotation/01.Nr
cd /home/train/13.functional_annotation/01.Nr
#ln -sf /opt/biosoft/bioinfomatics_databases/Nr/nr_fungi.dmnd ./
#diamond blastp --db nr_fungi --query ../proteins.fasta --out Nr.xml --outfmt 5 --sensitive --max-target-seqs 20 --evalue 1e-5 --id 10 --index-chunks 1 #diamond软件较ncbi-blast速度快的多;--outfmt 5 --sensitive方便计算mapiing,--max-target-seqs 20最多有20个比对结果,默认500;--id 10 identity大于10
# 160线程2.4G Hz CPU下计算
# real 12m57.783s
# user 713m2.268s
# sys 244m35.499s
gzip -dc ~/00.incipient_data/data_for_functional_annotation/Nr.xml.gz > Nr.xml
处理blast比对结果
parsing_blast_result.pl --out-hit-confidence --suject-annotation Nr.xml > Nr.tab#过滤结果
[train@MiWiFi-R3P-srv Nr]$ grep "^#" Nr.tab
#QueryID SubjectID Identity MatchLength MismatchLength Gaps QueryStart QueryEnd SubjectStart SubjectEnd Evalue BitScore CIP#subject mapping CALP_query#qury mapping CALP_subject SubjectAnnotation
[train@MiWiFi-R3P-srv Nr]$ cut -f 1 Nr.tab | uniq |wc -l#统计比对到的基因数量
4171
nr_species_distribution.pl Nr.tab > Nr_species_distribution.txt
gene_annotation_from_Nr.pl Nr.tab > Nr.txt #将注释信息提取出来,一个基因只提取一个注释结果(出现次数最多的结果)
[train@MiWiFi-R3P-srv Nr]$ gene_annotation_from_Nr.pl
Usage:
/home/train/bin/gene_annotation_from_Nr.pl [options] Nr.tab > Nr.txt
程序用于提取基因的Nr注释结果。Blast分析后的Nr注释结果中含有多种结果,本程序提取得到基因最可能的注释结果。
程序输入blast outfmt 6表格形式的注释结果,分析输入文件中最后一列表示SubjectAnnotation的描述结果,获得基因的注释结果。其结果信息分两列,第一列基因ID,第二列注释的蛋白名称。每个基因一行,若基因有多个可能的注释结果,以分号分割。
--desc-identity <float> default: 0.4
若两个注释字符串描述之间的相似性不低于该阈值,则认为是同一个注释结果。
一个基因有多个Hit,不同的Hit有不同的描述性信息。多个描数性信息之间可能对应同一种蛋白名称,而其字符串不一致。此时,需要将多种描述合并为一种。程序根据其字符串特性,若两种不同描述的字符串的identity不小于该阈值,则认为是同一种描述,并使用最先出现的描述名称。例如: "Cytochrome b reductase 1" 和 "Cytochrome b561" 其实是同一种描述。像这种情况,设置较大的阈值,则可能认为它们是不同的描述了。
两个字符串进行identity分析的方法:将两个字符串分别使用空打断后,在切割成长度为3个字符的mer,在比较相同mer的比例,得到identity值。两个字符串分别进行分析,得到两个identity值,取其算术平均值作为最终的idnetity值。
--hit-ratio <float> default: 0.3
--hit-num <int> default: 2
一个基因有多种注释。若某种注释的比例不低于--hit-ratio阈值,且注释的数量不低于--hit-num阈值,则给出到最终的注释结果中。否则,仅给出evalue最优的结果。
--out-num <int> default: max
在前两个参数的基础上,再设置最多输出指定数目的注释条目数。不添加该参数,则输出所有可能的注释结果,多个注释之间使用分号分割。
[train@MiWiFi-R3P-srv Nr]$ head Nr.txt
MS01Gene000001 hypothetical protein Malapachy_3516
MS01Gene000002 Similar to S.cerevisiae protein MCM6 (Protein involved in DNA replication)
MS01Gene000003 hypothetical protein MGL_2122; Similar to S.cerevisiae protein RRP7 (Essential protein involved in rRNA processing and ribosome biogenesis)
MS01Gene000004 hypothetical protein Malapachy_3517; Similar to S.cerevisiae protein RSM24 (Mitochondrial ribosomal protein of the small subunit)
MS01Gene000005 hypothetical protein Malapachy_3533
MS01Gene000006 tim50-mitochondrial inner membrane import translocase subunit
MS01Gene000007 hypothetical protein MGL_2118; 2-hydroxyacid dehydrogenase
MS01Gene000008 Putative peptidyl-tRNA hydrolase PTRHD1; Hypothetical protein MSYG_0013
MS01Gene000009 transport protein particle complex subunit; hypothetical protein MGL_2116
MS01Gene000010 hypothetical protein MGL_2115
cp Nr.tab ../functional_annotation.Nr.tab
cp Nr.txt ../functional_annotation.Nr.txt
cp Nr_species_distribution.txt ../functional_annotation.Nr.species_distribution
cd ..
# 2. 进行 Swiss-Prot 注释(基于序列相似性)
mkdir -p /home/train/13.functional_annotation/02.Swiss-Prot
cd /home/train/13.functional_annotation/02.Swiss-Prot
# 使用 ncbi-blast+ 进行 Swiss-Prot 注释
head -n 200 ../proteins.fasta > test.fasta
~/bin/blast.pl --CPU 8 --outfmt 5 -clean /opt/biosoft/bioinfomatics_databases/Swiss-Prot/uniprot_sprot test.fasta > blast.xml
# real 0m47.846s
# user 4m53.512s
# sys 0m0.752s
# 使用diamond进行分析
ln -sf /opt/biosoft/bioinfomatics_databases/Swiss-Prot/uniprot_sprot.dmnd ./
diamond blastp --db uniprot_sprot --query ../proteins.fasta --out uniprot_sprot.xml --outfmt 5 --sensitive --max-target-seqs 20 --evalue 1e-5 --id 10 --index-chunks 1
# real 0m50.636s
# user 6m29.161s
# sys 0m1.372s
parsing_blast_result.pl --out-hit-confidence --suject-annotation uniprot_sprot.xml > uniprot_sprot.tab
gene_annotation_from_SwissProt.pl uniprot_sprot.tab > SwissProt.txt# #将注释信息提取出来,一个基因只提取一个注释结果(出现次数最多的结果),类似gene_annotation_from_Nr.pl脚本
cp uniprot_sprot.tab ../functional_annotation.Swiss-Prot.tab
cp SwissProt.txt ../functional_annotation.Swiss-Prot.txt
cd ..
# 3. 进行 COG / KOG 注释(基于序列相似性)
mkdir -p /home/train/13.functional_annotation/03.COG
cd /home/train/13.functional_annotation/03.COG
ln -sf /opt/biosoft/bioinfomatics_databases/COG/kog.dmnd ./
diamond blastp --db kog --query ../proteins.fasta --out kog.xml --outfmt 5 --sensitive --max-target-seqs 200 --evalue 1e-5 --id 10 --tmpdir /dev/shm --index-chunks 1
# real 0m14.567s
# user 1m52.251s
# sys 0m0.483s
cog_from_xml.pl --coverage 0.2 --evalue 1e-5 --db-fasta /opt/biosoft/bioinfomatics_databases/COG/kog.fasta --db-class /opt/biosoft/bioinfomatics_databases/COG/kog --fun-txt /opt/biosoft/bioinfomatics_databases/COG/fun.kog.txt kog.xml
cut -f 1,3,4 out.annot | gene_annotation_from_table.pl - > KOG.txt
cog_R.pl --title "KOG Function Classification of Whole Genome Genes of Malassezia sympodialis" --y-name "Number of Genes" out.class #该脚本来自于华大基因
cp out.annot ../functional_annotation.KOG.tab
cp KOG.txt ../functional_annotation.KOG.txt
cp out.pdf ../functional_annotation.KOG.pdf
cp out.png ../functional_annotation.KOG.png
cd ..
# 4. eggNOG 注释(基于序列相似性)
mkdir -p /home/train/13.functional_annotation/04.eggNOG
cd /home/train/13.functional_annotation/04.eggNOG
# eggNOG官网的网页工具提交进行注释
# http://eggnog-mapper.embl.de/提交蛋白序列
# one-to-one orthology only; evalue: 0.001; score 60; query coverage: 20%; subject coverage: 20%
# http://eggnog-mapper.embl.de/job_status?jobname=MM_1revx_hu
# 耗时2h19min
# wget http://eggnog-mapper.embl.de/MM_1revx_hu/query_seqs.fa.emapper.annotations
# wget http://eggnog-mapper.embl.de/MM_1revx_hu/query_seqs.fa.emapper.seed_orthologs
# 使用本地eggNOG数据库和eggnog-mapper软件进行注释
#ln -sf /opt/biosoft/eggnog-mapper-2.1.11/data/eggnog_proteins.dmnd ./
#diamond blastp --db eggnog_proteins --query ../proteins.fasta --out eggNOG.xml --outfmt 5 --sensitive --max-target-seqs 20 --evalue 1e-5 --id 10 --tmpdir /dev/shm --index-chunks 1
#parsing_blast_result.pl --no-header --max-hit-num 1 --HSP-num 1 eggNOG.xml | cut -f 1,2,11,12 > eggNOG.emapper.seed_orthologs
#/opt/biosoft/eggnog-mapper-2.1.11/emapper.py -m no_search --annotate_hits_table eggNOG.emapper.seed_orthologs -o eggNOG --dbmem --override
cp ~/00.incipient_data/data_for_functional_annotation/eggNOG.emapper* ./
ln -sf eggNOG.emapper.annotations eggNOG.annot
cp eggNOG.annot ../functional_annotation.eggNOG.tab
grep -v -P "^#" eggNOG.emapper.annotations | cut -f 1,8 > eggNOG.txt
cp eggNOG.txt ../functional_annotation.eggNOG.txt
cd ..
# 5. Interpro 注释 (基于结构域)
mkdir -p /home/train/13.functional_annotation/05.InterPro
cd /home/train/13.functional_annotation/05.InterPro
# 联网提交序列到EBI官网进行InterPro注释
#interProScan5.pl ../proteins.fasta chenllianfu@gmail.com interpro5 30
# 本地InterPro注释,可以或得TSV、XML、GFF3、SVG和HTML结果。新版的InterPro开始不支持SVG和HTML结果,网页提交完全不支持SVG和HTML结果。
#export PATH=/opt/sysoft/jdk-20.0.1/bin:$PATH
#/opt/biosoft/interproscan/interproscan.sh --output-file-base out --cpu 8 --formats TSV,XML,GFF3 --goterms --input uncompleted2.fasta
cp ~/00.incipient_data/data_for_functional_annotation/interpro* ./
grep IPR interpro.tsv | cut -f 1,12,13 | gene_annotation_from_table.pl - > InterPro.txt
cp interpro.tsv ../functional_annotation.InterPro.tab
cp InterPro.txt ../functional_annotation.InterPro.txt
cp interpro.gff3 ../functional_annotation.InterPro.gff3
cp interpro.html.tar.gz ../functional_annotation.InterPro.html.tar.gz
cp interpro.svg.tar.gz ../functional_annotation.InterPro.svg.tar.gz
cd ..
# 6. Pfam 注释(基于结构域,可以使用pfam编号开展基因家族分析)
mkdir -p /home/train/13.functional_annotation/06.Pfam
cd /home/train/13.functional_annotation/06.Pfam
#para_hmmscan.pl --outformat --cpu 4 --hmm_db /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-A.hmm ../proteins.fasta > Pfam.tab
cp ~/00.incipient_data/data_for_functional_annotation/Pfam.tab ./
cut -f 1,2,7 Pfam.tab | perl -e '<>; print <>' | gene_annotation_from_table.pl - > Pfam.txt
cp Pfam.tab ../functional_annotation.Pfam.tab
cp Pfam.txt ../functional_annotation.Pfam.txt
cd ..
# 7. GO 注释
mkdir -p /home/train/13.functional_annotation/07.GO
cd /home/train/13.functional_annotation/07.GO
# 整合eggNOG和InterPro中的GO注释结果
go_from_eggNOG_and_interpro.pl ../04.eggNOG/eggNOG.emapper.annotations ../05.InterPro/interpro.tsv > go.annot
go_reducing_go_number_para.pl /opt/biosoft/go_class/bin/go-basic.obo go.annot 8 > go_reduced.annot #减少go条目数目,比较耗计算
sort go_reduced.annot > go.annot; rm go_reduced.annot
gene_annotation_from_table.pl go.annot > GO.txt #
# GO wego分类图(按照goslim_agr.obo方法分了53类)(重点关注某些go编号,如agr:表示农业)来自华大
[train@MiWiFi-R3P-srv 07.GO]$ ls /opt/biosoft/go_class/bin/
annot2gaf.pl changsvgsize.pl get_Genes_From_GO.pl go-basic.obo go.obo goslim_agr.obo goslim_pir.obo go_svg.pl ontologizer2enriched_GOs_and_GeneID.pl
annot2wego.pl config.txt go.alias go.class go_reduced.annot goslim_generic.obo goslim_plant.obo make_go_class_config.pl
/opt/biosoft/go_class/bin/annot2wego.pl go.annot > go.wego
get_Genes_From_GO.pl /opt/biosoft/go_class/bin/go-basic.obo go.wego > go_class.tab
go_svg.pl --outdir ./ --name out --color "green" --mark "Whole Genome Genes" --note "GO Class of whole genome genes of Malassezia sympodialis" go.wego
perl -p -i -e 's/MovePer:0.125/MovePer:0.5/; s/FontSize:.*/FontSize:24/;' out.lst
/opt/biosoft/go_class/svg/distributing_svg.pl out.lst out.svg
/opt/biosoft/go_class/bin/changsvgsize.pl out.svg 150 -100
convert out.svg out.png #该脚本来自华大
# GO分类(按照goslim_agr.obo方法分了53类)
go_class.pl /opt/biosoft/go_class/bin/go-basic.obo go.annot --go_class_method /opt/biosoft/go_class/bin/goslim_agr.obo > go_class.tab
cp go.annot ../functional_annotation.GO.tab
cp GO.txt ../functional_annotation.GO.txt
cp out.svg ../functional_annotation.GO_class.svg
cp out.png ../functional_annotation.GO_class.png
cp go_class.tab ../functional_annotation.GO_class.tab
cd ..
# 8. KAAS 注释 (KEGG注释)在网页上进行,也可以从eggNOG的结果中进行提取
mkdir -p /home/train/13.functional_annotation/08.KAAS
cd /home/train/13.functional_annotation/08.KAAS
# 在 http://www.genome.jp/kaas-bin/kaas_main 网页工具中提交序列进行注释。需要填写邮箱信息,在网页中提交后需要再进入邮箱,点击邮件中的提交链接,才能开始计算。
# Selected organisms: hsa, mmu, rno, dre, dme, cel, ath, sce, cal, spo, ecu, pfa, cho, ehi, eco, nme, hpy, bsu, lla, mge, mtu, syn, aae, mja, ape, sce, dha, ncr, fox, ssl, afm, cpw, bze, tml, uma, mrt, cgi, hir
# 注释完毕后,从填写的邮箱中进入注释完毕后的网页,下载注释结果 query.ko 文件。
#https://www.genome.jp/kaas-bin/kaas_main?mode=user&id=1626238738&key=GdCfKFob
#wget https://www.genome.jp/tools/kaas/files/dl/1689179635/query.ko
cp ~/00.incipient_data/data_for_functional_annotation/query.ko ./
cp ~/00.incipient_data/data_for_functional_annotation/ko00001.keg ./
gene_annotation_from_kaas.pl query.ko > KEGG.txt
cp KEGG.txt ../functional_annotation.KEGG.txt
cd ..
# 9. 整合各种注释结果到一个表格中
gene_annotation_from_all.pl --all-protein proteins.fasta --html --table-width 3500 --column-width-geneID 150 01.Nr/Nr.txt 02.Swiss-Prot/SwissProt.txt 03.COG/KOG.txt 04.eggNOG/eggNOG.txt 05.InterPro/InterPro.txt 06.Pfam/Pfam.txt 07.GO/GO.txt 08.KAAS/KEGG.txt > functional_annotation.All.html
gene_annotation_from_all.pl --all-protein proteins.fasta 01.Nr/Nr.txt 02.Swiss-Prot/SwissProt.txt 03.COG/KOG.txt 04.eggNOG/eggNOG.txt 05.InterPro/InterPro.txt 06.Pfam/Pfam.txt 07.GO/GO.txt 08.KAAS/KEGG.txt > functional_annotation.All.tab 2> functional_annotation.All.stats
# 10. GO 和 KEGG 富集分析
mkdir /home/train/13.functional_annotation/09.GO_enrichment
cd /home/train/13.functional_annotation/09.GO_enrichment
ln -s /opt/biosoft/go_class/bin/go-basic.obo ./
cut -f 1,2 ~/00.incipient_data/data_for_functional_annotation/functional_annotation.GO.tab > go.annot
cp ~/00.incipient_data/data_for_functional_annotation/DEG.up.list ./
cp ~/00.incipient_data/data_for_functional_annotation/DEG.down.list ./
#perl -e '<>; while (<>) {@_ = split /\t/; print "$_[0]\t$_[2]\n";}' /home/train/09.RNA-seq_analysis_by_cufflinks/cuffnorm/DEG.S2_VS_S4.S4-Up.tab > DEG.up.list
#perl -e '<>; while (<>) {@_ = split /\t/; print "$_[0]\t$_[2]\n";}' /home/train/09.RNA-seq_analysis_by_cufflinks/cuffnorm/DEG.S2_VS_S4.S4-Down.tab > DEG.down.list
go_enrichment.pl --tmp_prefix go_enrichment_up go-basic.obo go.annot DEG.up.list > go_enrichment.up.txt
go_enrichment.pl --tmp_prefix go_enrichment_down go-basic.obo go.annot DEG.down.list > go_enrichment.down.txt #go.annot 全基因组的注释文件
rm go_enrichment_*
cd ..
mkdir /home/train/13.functional_annotation/10.KEGG_enrichment
cd /home/train/13.functional_annotation/10.KEGG_enrichment
cp ~/00.incipient_data/data_for_functional_annotation/ko00001.keg ./
cp ../08.KAAS/query.ko ./
cp ~/00.incipient_data/data_for_functional_annotation/DEG.up.list ./
cp ~/00.incipient_data/data_for_functional_annotation/DEG.down.list ./
pathview.pl --cpu 10 --kokeg ko00001.keg query.ko DEG.up.list DEG.down.list
cd ..
# 11. 真菌的 Transcription Fatcor 注释
mkdir -p /home/train/13.functional_annotation/11.TF
cd /home/train/13.functional_annotation/11.TF
interpro2tf_for_Fungi.pl ~/00.incipient_data/data_for_functional_annotation/functional_annotation.InterPro.tab --out_prefix TF
cd ..
# 12. CAZY annotation (动植物不做)
mkdir -p /home/train/13.functional_annotation/12.CAZyme
cd /home/train/13.functional_annotation/12.CAZyme
# 使用 HMM 方法进行注释
para_hmmscan.pl --no_cut_ga --cpu 4 --hmm_db /opt/biosoft/bioinfomatics_databases/dbCAN_V11/dbCAN-HMMdb ../proteins.fasta > hmmscan.domtbl
# real 0m24.270s
# user 1m10.534s
# sys 1m40.529s
# 使用 Blastp 方法进行注释
ln -s /opt/biosoft/bioinfomatics_databases/dbCAN_V11/CAZyDB.dmnd ./
diamond blastp --db CAZyDB --query ../proteins.fasta --out diamond.xml --outfmt 5 --sensitive --max-target-seqs 500 --evalue 1e-5 --id 20 --tmpdir /dev/shm --index-chunks 1
# real 1m48.956s
# user 13m57.829s
# sys 0m2.708s
parsing_blast_result.pl --no-header --evalue 1e-5 --HSP-num 1 diamond.xml > blastp.outfmt6
# 合并两者结果
dbcan_combine.pl --query ../proteins.fasta --CAZy_blastDB /opt/biosoft/bioinfomatics_databases/dbCAN_V11/CAZyDB.fa hmmscan.domtbl blastp.outfmt6
#dbcan_combine.pl --query ../proteins.fasta --CAZy_blastDB /opt/biosoft/bioinfomatics_databases/dbCAN_V11/CAZyDB.fa --threshold_file_in /opt/biosoft/bioinfomatics_databases/dbCAN_v9.0/out50.threshold.tab hmmscan.domtbl blastp.outfmt6
cd ..
# 13. secreted protein annotation
mkdir -p /home/train/13.functional_annotation/13.secreted_protein
cd /home/train/13.functional_annotation/13.secreted_protein
# 首先,进行信号肽分析。分泌蛋白都具有信号肽。
mkdir singalp
cd singalp
/opt/biosoft/signalp-5.0/bin/signalp -batch 30000 -org euk -fasta ../../proteins.fasta -gff3 -mature
# 193 个蛋白具有信号肽
cd ..
# 再进行跨膜区分析。若具有跨膜区,则蛋白会和膜进行结合,从而固定到膜上,不会成为分泌蛋白。
mkdir TMHMM
cd TMHMM
/opt/biosoft/tmhmm-2.0c/bin/tmhmm ../singalp/proteins_mature.fasta > tmhmm.out
grep "Number of predicted TMHs: 0" tmhmm.out | perl -p -e 's/#\s+(\S+).*/$1/' > genes_without_TMHs.list
fasta_extract_subseqs_from_list.pl ../../proteins.fasta genes_without_TMHs.list > ../candidate_secreted_proteins.fasta
# 141 个蛋白没有跨膜区
cd ..
# 再分析GPI锚定位点。GPI锚定蛋白和膜结合,从而固定到膜上,不会成为分泌蛋白。
mkdir PredGPI
cd PredGPI
# split_fasta_file_averagely.pl ../candidate_secreted_proteins.fasta 2
# https://busca.biocomp.unibo.it/predgpi
# 一次最多允许提交100条序列。提交后点击download。
# 结果文件是fasta格式文件,其头部包含结果信息。
# PredGPI.fasta
# 取阈值FDR 0.5%。
# FDR <= 0.1% GPI-anchored: highly probable
# FDR <= 0.5% GPI-anchored: probable
# FDR <= 1.0% GPI-anchored: lowly probable
# https://busca.biocomp.unibo.it/predgpi/687d287b-5d54-4f3d-bc6c-4771a686269a/showresult/
# https://busca.biocomp.unibo.it/predgpi/d9a2a9d6-39f1-464e-8c56-ddda1001e33a/showresult/
cp ~/00.incipient_data/data_for_functional_annotation/GPIPE_query_results__tmp_tmp0WKt1s.txt PredGPI.fasta
perl -e 'while (<>) { if (m/^>(\S+).*FPrate:(\S+)/ && $2 <= 0.01) { print "$1\n"; } }' PredGPI.fasta > GPI_gene.list
# 31个基因有GPI锚定位点
fasta_extract_subseqs_from_list.pl --reverse ../candidate_secreted_proteins.fasta GPI_gene.list > ../candidate_secreted_proteins.NO_GPI.fasta
cd ..
# 最后,进行亚细胞定位,选取定位到胞外的蛋白作为分泌蛋白基因
mkdir BUSCA
cd BUSCA
# http://busca.biocomp.unibo.it/
# 一次最多允许提交500条序列。选择Fungi类型,点击Start prediction。
# http://busca.biocomp.unibo.it/240af241-5353-4ad0-bd46-1794f4cdb8a1/showresult/
# 下载表格格式结果:BUSCA_JOB_240af241-5353-4ad0-bd46-1794f4cdb8a1.csv
cp ~/00.incipient_data/data_for_functional_annotation/BUSCA_JOB_240af241-5353-4ad0-bd46-1794f4cdb8a1.csv BUSCA.out.csv
perl -e '<>; while (<>) { @_ = split /,/; $stats{$_[2]}{$_[0]} = 1; } foreach (sort keys %stats) { @gene = sort keys %{$stats{$_}}; my $gene_number = 0; $gene_number = @gene; print STDERR "$_\t$gene_number\n"; if ($_ eq "C:extracellular space") { foreach (@gene) { print "$_\n"; } } }' BUSCA.out.csv > extracellular_gene.list 2> BUSCA.out.csv.stats
# 118个蛋白定位于胞外
fasta_extract_subseqs_from_list.pl ../candidate_secreted_proteins.NO_GPI.fasta extracellular_gene.list > ../candidate_secreted_proteins.NO_GPI.extracellular.fasta
cd ..
ln -s candidate_secreted_proteins.NO_GPI.extracellular.fasta secreted_proteins.fasta
# 分泌蛋白最终结果是 secreted_proteins.fasta
cd ..
tar tzf file.gz #查看压缩包的内容,不解压
[train@MiWiFi-R3P-srv 13.functional_annotation]$ ll
total 109344
drwxr-xr-x 3 train train 4096 Jul 24 10:23 00.blast_to_genome
drwxr-xr-x 2 train train 34 Jul 24 17:02 01.Nr
drwxr-xr-x 2 train train 138 Jul 24 15:37 02.Swiss-Prot
drwxr-xr-x 2 train train 174 Jul 24 14:45 03.COG
drwxr-xr-x 2 train train 115 Jul 24 14:54 04.eggNOG #COG的扩大版本
drwxr-xr-x 2 train train 122 Jul 24 15:23 05.InterPro #基于结构域进行比对
drwxr-xr-x 2 train train 38 Jul 24 17:05 06.Pfam #同源基因预测
drwxr-xr-x 2 train train 167 Jul 24 16:41 07.GO #可以对感兴趣的go条目进行注释
drwxr-xr-x 2 train train 57 Jul 24 16:51 08.KAAS #KEGG
drwxr-xr-x 2 train train 4096 Jul 24 17:41 09.GO_enrichment
drwxr-xr-x 5 train train 4096 Jul 24 17:46 10.KEGG_enrichment
drwxr-xr-x 2 train train 38 Jul 24 17:58 11.TF














网友评论