/TJPROJ6/NC_BG_SH/personal_dir/lihang/know_lncRNA_class/second/know_lncRNA_class.sh /TJPROJ6/NC_BG_SH/personal_dir/lihang/know_lncRNA_class/second
利用cuffcompare,比较去除已知lncRNA的exon.gtf和已知lncRNA.gtf,根据class_code判断已知lncRNA的类型。
由于每个参考基因组gtf的格式会有不同,使用时需根据gtf文件依次执行每步。
# step 1 # exon.gtf know_lncRNA.gtf ln -s /TJPROJ6/RNA/Database/genome/Animal/Sus_scrofa/Ensembl_104/ncRNAseq/exon.gtf . ln -s /TJPROJ6/RNA/Database/genome/Animal/Sus_scrofa/Ensembl_104/ncRNAseq/lncRNA.gtf . # step 2 # 根据gene_biotype和transcript_biotype去掉exon.gtf中的lncRNA。 # 将transcript_biotype为此list中的列为lncRNA,lncRNA_tag_list='lncRNA,lincRNA,macro_lncRNA,retained_intron,antisense,antisense_RNA,sense_intronic,sense_overlapping' cat exon.gtf |grep -v lncRNA > exon.rm.lncrna.gtf # step 3 # 利用cuffcompare比较去掉lncRNA的exon.gtf和know_lncRNA.gtf。获得class_code(http://cole-trapnell-lab.github.io/cufflinks/cuffcompare/index.html) /PUBLIC/source/RNA/lncRNA/softwares/cufflinks-2.2.1_patched/cuffcompare -C -r exon.rm.lncrna.gtf lncRNA.gtf -o cuffcmp # step 4 # un_lnc_gene_type 两列"transcript_id\tgene_biotype" cat exon.rm.lncrna.gtf|awk -F '"' '{print $4"\t"$8}'|sort -u > un_lnc_gene_type # step 5 # 添加biotype到gtf.tmap awk 'NR==FNR{a[$1]=$2}NR>FNR{print a[$2]"\t"$0}' un_lnc_gene_type cuffcmp.lncRNA.gtf.tmap > cuffcmp.lncRNA.gtf.tmap.add.reftype # step 6 cat cuffcmp.lncRNA.gtf.tmap.add.reftype|cut -f 1-6 > cuffcmp.lncRNA.gtf.tmap.add.reftype.dict # step 7 # 根据class code对lncRNA分类 # cuffcmp.lncRNA.gtf.tmap.add.reftype.dict表头: # ref_gene_type ref_gene_id ref_id class_code cuff_gene_id cuff_id cat cuffcmp.lncRNA.gtf.tmap.add.reftype.dict |awk -F '\t' '$4=="u"' > linc.tmap cat cuffcmp.lncRNA.gtf.tmap.add.reftype.dict |awk -F '\t' '$4=="x"' > antisense.tmap cat cuffcmp.lncRNA.gtf.tmap.add.reftype.dict |awk -F '\t' '$4~/o|j/' > sense_overlapping.tmap cat cuffcmp.lncRNA.gtf.tmap.add.reftype.dict |awk -F '\t' '$4=="i"' > sense_intron.tmap cat cuffcmp.lncRNA.gtf.tmap.add.reftype.dict |awk -F '\t' '$4=="c"' > containing.tmap # step 8 # 统计数据并绘图 wc -l *tmap | grep -v 'total' |grep -v 'cuffcmp.lncRNA.gtf.tmap' > lncRNA_classification.txt.tmp awk -F ' ' '{print$2"\t"$1}' lncRNA_classification.txt.tmp| awk -F '.tmap' '{print$1$2}' > lncRNA_classification.txt sed -i 's/linc/lincRNA/g' lncRNA_classification.txt rm lncRNA_classification.txt.tmp Rscript /TJPROJ6/GB_TR/USER/jiangkai/pipline_updata_test/ncRNA/v4.2.0/bin/lncRNA_classify.R lncRNA_classification.txt .