perl /PUBLIC/source/RNA/noRef/Advanced_analysis/script/cogAnalysis_v1.pl -i Phaeodactylum_tricornutum_NCBI/Phaeodactylum_tricornutum_NCBI_gene.fa -o ./
#!/usr/bin/perl -w use strict; use warnings; use Getopt::Long; use Bio::SearchIO; use File::Basename; use Cwd; my ($seqfile,$out,$help); GetOptions( "h|?|help" =>\$help, "i=s" => \$seqfile, "o=s" => \$out ); if(!defined($seqfile)||!defined($out)||defined($help)){ &usage; exit 0; } sub usage { print STDERR <<USAGE; ========================================================================= Description KOG annotation and analysis. perl kogAnalysis.pl -i <seq file> -o <output> -i <string> input sequence file. -o <string> output KOG annotation dir. -h,-help,? print this usage. ========================================================================== USAGE } unless(-d $out){ !system("mkdir $out") or warn "something goes wrong!\n"; } my $dir=getcwd(); my @suffix=qw(.fa .fasta .fna); my $basename=basename($seqfile,@suffix); my $blastfile="$out/$basename.cog.blast"; ###########################blast program################################ #system("formatdb -i /ngs/RNA/wujun/database/KOG/kyva -p T"); my $KOGDB="/PUBLIC/source/RNA/database/COG/myva"; #system("/opt/bio/ncbi/bin/blastall -p blastx -i $seqfile -o $blastfile -d $KOGDB -e 1e-3 -F \"m S\""); system("/PUBLIC/software/public/Alignment/ncbi-blast-2.2.28+/bin/blastx -query $seqfile -out $blastfile -db $KOGDB -evalue 1e-3 -seg yes -soft_masking true -num_threads 15"); ################################################# ## parse blast result ## ################################################# my $searchio=new Bio::SearchIO(-format => 'blast', -file => $blastfile); my $parsefile="$out/$basename.cog.parse"; open BLASTPARSE,">$parsefile" or die $!; while(my $result = $searchio->next_result){ print BLASTPARSE "gi|",$result->query_name,"|ref|XXXXXXXX\n"; while( my $hit = $result->next_hit){ print BLASTPARSE $hit->name," "; my $h_len =length($hit->name); my $m = 0; my $n = 0; while(my $hsp = $hit->next_hsp){ if($n>=1) {print BLASTPARSE " " x ($h_len+4);} print BLASTPARSE "(",$hsp->score," ",$hsp->evalue,") "; print BLASTPARSE $hsp->query->start,'..',$hsp->query->end," ",$hsp->hit->start,'..',$hsp->hit->end,"\n"; $m++; $n++; } if($m==0 && (!defined($hit->next_hsp))){ print BLASTPARSE "\n"; } } } close BLASTPARSE; my %kogann1=(); my %kogann2=(); my %category=(); my %kog_genes=(); ######################################## # for reading 'kog' into ram # ######################################## sub get_kog { open KOG, "</PUBLIC/source/RNA/database/COG/whog" or die "Can't open file: $!"; my %kog; my @functions; $/ = "\n["; while(my $record = <KOG>) { chomp $record; $record=~s/\[//; $record = '['.$record; my $function = {}; my $key; my $kog_num; my $value; my @proteins = (); for (split /\n/, $record) { if (defined $_ && /^(\[\w+\])\s+(COG\d{4})\s+(.*)/ ) { # Functions ($key, $value) = ($1, $3); # $key e.g. [OR],$value KOG 's description and function $kog_num = $2; $function->{$key} = $value; if($key=~/\[(.*?)\]/){ $category{$kog_num}=$1; # KOG 's category } } elsif (defined $_ && /^\s+?\w+?:\s{2}(.+)/) { push @proteins, $1; # KOG 's proteins e.g. At1g31340 } } $kogann1{$kog_num}=$key; # KOG [category] $kogann2{$kog_num}=$value; # KOG 's description and function $kog{$kog_num} = join ",", @proteins; push @functions, $function; # every KOG function } return (\%kog,\@functions); # KOG 's proteins and fuction close KOG; } ####################################### # which species # ####################################### sub which_species { my $name = $_[0]; # input KOG's fuction e.g. At1g31340 if ($name =~ /^At/) {return 'ath';} elsif ($name =~ /^CE/) {return 'cel';} elsif ($name =~ /^Hs/) {return 'hsa';} elsif ($name =~ /^Y/) {return 'sce';} elsif ($name =~ /^SP/) {return 'sp';} elsif ($name =~ /^ECU/) {return 'ecu';} else {return 'dme';} } ####################################### # dispose # # the BeTs in which kogs # # the species of each kog # ####################################### sub dispose { my %kog = %{$_[0]}; #{No. KOG} => (protein1,protein2,...,proteinX). KOG 's proteins e.g. At1g31340 my $ref_BeTs = $_[1]; #(BeT1,BeT2,...,BeTn) balst parse protein e.g. At4g34350 my $KOG_num; my $BeT; my $target; my %same_kog; #All BeTs in the same KOG: {No. KOG} => (BeT1,BeT2,...,BeTn) my @which_kog; #the KOGs which all the BeTs belongs to my %num; my %species; #what species in BeTs -- for the number of species in the behind my $i; for $BeT (@$ref_BeTs) { for $KOG_num (keys %kog) { for $target (split /,/, $kog{$KOG_num}) { my $same = 0; if ($BeT eq $target) { # blast parse protein in KOG print LOG "*****\t"; print LOG "$BeT => $KOG_num\t*****\n"; $same_kog{$KOG_num} .= $BeT.','; # the protein ($Bet) belongs to the same KOG ($KOG_num) for (@which_kog) { if ($_ eq $KOG_num) { $same = 1; last; } } if ($same == 0) { push @which_kog, $KOG_num; } else {next;} } else {next;} } next; } next; } for $KOG_num (@which_kog) { for (split /,/, $same_kog{$KOG_num}) { $species{&which_species($_)} = ''; # $_ is blast parse protein ($Bet),$Bet belongs to the species (the value is null) } $num{$KOG_num} = keys %species; # $num{$KOG_num} is an array } return (\@which_kog,\%num); } ####################################### # main # ####################################### my ($ref_kog,$ref_functions) = &get_kog; my $kogresult="$out/$basename.cog.out"; my $koglog="$out/$basename.coglog"; open BLASTP, "$parsefile" or die "Can't open file: $!"; open RESULT, ">$kogresult" or die "Can't open file: $!"; open LOG, ">$koglog" or die "Can't open file: $!"; $/ = "gi|"; while (my $record = <BLASTP>) { chomp $record; $record = "gi|".$record; my @BeTs; my $gi; for (split /\n/, $record) { if (/^gi\|(.+?)\|ref/) { $gi = $1; print LOG "gi = $gi\n"; next; } elsif (/^(.+?)\s\(\w+?\s(.+?)\)/) { my ($BeT, $E_value) = ($1, $2); if ($E_value <= 0.01) { #1e-3 changed at Mar, 30, 2008 by liz ,blast evalue 1e-3 push @BeTs, $BeT; } else { next; } } else { next; } } my ($ref_which_kog, $ref_num) = &dispose($ref_kog, \@BeTs); #ref_kog get from &get_kog, $ref_kog is proteins in KOG e.g.(protein1,protein2,...,proteinX),$ref_which_kog the KOG num ,$ref_num the number of KOG 's species numbers my $KOG_num; for $KOG_num (@$ref_which_kog) { print RESULT "$KOG_num: $$ref_num{$KOG_num}\n"; if ($$ref_num{$KOG_num} >= 3) { #print the KOGs and choose the KOGs in 3 species print RESULT "$gi in $KOG_num\n"; print RESULT "---------------------------------\n"; last; } } } close BLASTP; close RESULT; close LOG; my %kog_count; my @class=qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z); open RESULTS,"$kogresult" or die "Can't open file:$!"; my $kogannotation="$out/Annotation_COG.xls"; my $kogtxt="$out/$basename.cog.txt"; $/="\n"; my $desc="/PUBLIC/database/Common/KOG/kog_categories.txt"; my %desc; open DESC,"$desc" or die $!; while(<DESC>){ chomp; my @array=split/\t/; print $array[0]."\t".$array[1]."\n"; $desc{$array[0]}=$array[1]; print $array[0]."\t".$desc{$array[0]}."\n"; } close DESC; foreach my $key (keys %desc){ print "$key\t$desc{$key}\n"; } my $annotated_genes=0; open KOGANN,">$kogannotation"; print KOGANN "Gene ID\tCOG ID\tAbbreviation\tCOG Description\n"; open KOGTXT,">$kogtxt"; while(<RESULTS>){ chomp; if(/^(.+?)\sin\s(COG.+)/){ print KOGANN "$1\t$2\t$category{$2}\t$kogann2{$2}\n"; print KOGTXT "$1\t$2\n"; my $kogid=$2; my $geneid=$1; my @tmp=split "",$category{$kogid}; foreach(@tmp){ $kog_count{$_} +=1; $kog_genes{$_}{$geneid}=1; } $annotated_genes++; } } close RESULTS; close KOGANN; open OUT,">$out/COG_classification.xls"; open STAT,">$out/COG_classification_count.txt"; print OUT "Class\tAbbreviation\tGene Numbers\tGene IDs\n"; print STAT "###\tTotal annotated genes: $annotated_genes\n"; foreach my $key(sort keys %kog_count){ my %tmp=%{ $kog_genes{$key}}; print OUT "$desc{$key}\t".$key."\t".$kog_count{$key}."\t".join(",",sort keys %tmp)."\n"; print STAT $key."\t"."($key) $desc{$key}\t".$kog_count{$key}."\n"; } close STAT; close OUT; my $R= <<"END"; #======================================================================================================== setwd("$out") dat<-read.table("$out/COG_classification_count.txt",sep="\\t",skip=1); bardat<-(dat[,3]/$annotated_genes)*100 cog.des <- as.character(dat[,2]) names(bardat) <- as.character(dat[,1]) yymax<-max(bardat) yymax<-ceiling(yymax*1.2) col=rainbow(nrow(dat)) #def.par <- par() pdf("COG_classification.pdf",18,9) layout(matrix(c(1,2),nr=1),widths=c(30,15)) op1 <- par(mar=c(4.1,5.1,4.1,0)) a<-barplot(bardat,xaxt="n",col=col,ylim=c(0,yymax),cex.names=1.2,las=1,axis.lty=0,cex.axis=1) box() mtext("COG Classification",side=1,line=2,font=2,cex=0.8) mtext("Percent(%)",side=2,line=2.5,font=2,cex=0.8) title("COG Function Classification") par(cex.axis=0.6) axis(side=1,at=a,labels=dat[,1],padj=-2,tick=FALSE) op1 <- par(mar=c(0,0,1.1,1.1)) plot(0,0,pch="",xlim=c(0,1),ylim=c(0,yymax),bty="n",axes=F,xlab="",ylab="") for(i in length(cog.des):1){ text(0.01,i*yymax/(nrow(dat)+1),cog.des[length(cog.des)+1-i],pos=4,cex=1) } par(op1) #par(def.par) dev.off() png("COG_classification.png",width=1250,height=650,type="cairo-png") layout(matrix(c(1,2),nr=1),widths=c(30,15)) op1 <- par(mar=c(4.1,5.1,4.1,0)) a<-barplot(bardat,xaxt="n",col=col,ylim=c(0,yymax),cex.names=1.2,las=1,axis.lty=0,cex.axis=1) box() mtext("COG Classification",side=1,line=2,font=2,cex=0.8) mtext("Percent(%)",side=2,line=2.5,font=2,cex=0.8) title("COG Function Classification") par(cex.axis=0.6) axis(side=1,at=a,labels=dat[,1],padj=-2,tick=FALSE) op1 <- par(mar=c(0,0,1.1,1.1)) plot(0,0,pch="",xlim=c(0,1),ylim=c(0,yymax),bty="n",axes=F,xlab="",ylab="") for(i in length(cog.des):1){ text(0.01,i*yymax/(nrow(dat)+1),cog.des[length(cog.des)+1-i],pos=4,cex=1) } par(op1) #par(def.par) dev.off() #======================================================================================================== END open R,"|/PUBLIC/software/RNA/R-3.1.2/R-3.1.2/bin/R --vanilla --slave" or die $!; print R $R; close R; open RSCRIPT,">$out/COG_classification.R"; print RSCRIPT $R; close RSCRIPT;