COG注释

perl /PUBLIC/source/RNA/noRef/Advanced_analysis/script/cogAnalysis_v1.pl -i Phaeodactylum_tricornutum_NCBI/Phaeodactylum_tricornutum_NCBI_gene.fa -o ./
#!/usr/bin/perl -w
use strict;
use warnings;
use Getopt::Long;
use Bio::SearchIO;
use File::Basename;
use Cwd;

my ($seqfile,$out,$help);
GetOptions(
	"h|?|help" =>\$help,
	"i=s" => \$seqfile,
	"o=s" => \$out
);
if(!defined($seqfile)||!defined($out)||defined($help)){
	&usage;
	exit 0;
}

sub usage 
{
	print STDERR <<USAGE;
=========================================================================
Description KOG annotation and analysis.
perl kogAnalysis.pl -i <seq file> -o <output>
	-i	<string>	input sequence file.
	-o	<string>	output KOG annotation dir.
	-h,-help,?		print this usage.

==========================================================================
USAGE
}
unless(-d $out){
	!system("mkdir $out") or warn "something goes wrong!\n";
}

my $dir=getcwd();
my @suffix=qw(.fa .fasta .fna);
my $basename=basename($seqfile,@suffix);
my $blastfile="$out/$basename.cog.blast";
###########################blast program################################
#system("formatdb -i /ngs/RNA/wujun/database/KOG/kyva -p T");
my $KOGDB="/PUBLIC/source/RNA/database/COG/myva";
#system("/opt/bio/ncbi/bin/blastall -p blastx -i $seqfile -o $blastfile -d $KOGDB -e 1e-3 -F \"m S\"");
system("/PUBLIC/software/public/Alignment/ncbi-blast-2.2.28+/bin/blastx -query $seqfile -out $blastfile -db $KOGDB -evalue 1e-3 -seg yes -soft_masking true -num_threads 15");
#################################################
##         parse blast result                  ##
#################################################
my $searchio=new Bio::SearchIO(-format => 'blast',
				-file => $blastfile);
my $parsefile="$out/$basename.cog.parse";
open BLASTPARSE,">$parsefile" or die $!;
while(my $result = $searchio->next_result){
	print BLASTPARSE "gi|",$result->query_name,"|ref|XXXXXXXX\n";
	while( my $hit = $result->next_hit){
		print BLASTPARSE $hit->name," ";
			my $h_len =length($hit->name);
			my $m = 0;
			my $n = 0;
			while(my $hsp = $hit->next_hsp){
				if($n>=1) {print BLASTPARSE " " x ($h_len+4);}
				print BLASTPARSE "(",$hsp->score," ",$hsp->evalue,")  ";
				print BLASTPARSE $hsp->query->start,'..',$hsp->query->end,"  ",$hsp->hit->start,'..',$hsp->hit->end,"\n";
				$m++;
				$n++;
			}
			if($m==0 && (!defined($hit->next_hsp))){
				print BLASTPARSE "\n";
			}
	}
}
close BLASTPARSE;
my %kogann1=();
my %kogann2=();
my %category=();
my %kog_genes=();
########################################
#      for reading 'kog' into ram      #
########################################
sub get_kog {
	open KOG, "</PUBLIC/source/RNA/database/COG/whog"
		or die "Can't open file: $!";
	my %kog;
	my @functions;
	$/ = "\n[";
	while(my $record = <KOG>) {
		chomp $record;
		$record=~s/\[//;
		$record = '['.$record;
		my $function = {};
		my $key;
		my $kog_num;

		my $value;
		my @proteins = ();
		for (split /\n/, $record) {
			if (defined $_ && /^(\[\w+\])\s+(COG\d{4})\s+(.*)/ ) {		# Functions
				($key, $value) = ($1, $3);					# $key e.g. [OR],$value KOG 's description and function
				$kog_num = $2;
				$function->{$key} = $value;
				if($key=~/\[(.*?)\]/){
					$category{$kog_num}=$1;				# KOG 's category
				}
			}
			elsif (defined $_ && /^\s+?\w+?:\s{2}(.+)/) {
				push @proteins, $1;						# KOG 's proteins e.g. At1g31340 
			}
		}
		$kogann1{$kog_num}=$key;							# KOG [category]
		$kogann2{$kog_num}=$value;							# KOG 's description and function
		$kog{$kog_num} = join ",", @proteins;
		push @functions, $function;							# every KOG function
	}
	
	return (\%kog,\@functions);								# KOG 's proteins and fuction
	close KOG;
}
#######################################
#           which species             #
#######################################
sub which_species {
	my $name = $_[0];									# input KOG's fuction e.g. At1g31340 
	if ($name =~ /^At/) {return 'ath';}
	elsif ($name =~ /^CE/) {return 'cel';}
	elsif ($name =~ /^Hs/) {return 'hsa';}
	elsif ($name =~ /^Y/) {return 'sce';}
	elsif ($name =~ /^SP/) {return 'sp';}
	elsif ($name =~ /^ECU/) {return 'ecu';}
	else {return 'dme';}
}
#######################################
#              dispose                #
#      the BeTs in which kogs         #
#      the species of each kog        #
#######################################
sub dispose {
	my %kog = %{$_[0]};													    #{No. KOG} => (protein1,protein2,...,proteinX).  KOG 's proteins e.g. At1g31340
	my $ref_BeTs = $_[1];													#(BeT1,BeT2,...,BeTn) balst parse protein e.g. At4g34350
	my $KOG_num;
	my $BeT;
	my $target;
	my %same_kog;														    #All BeTs in the same KOG: {No. KOG} => (BeT1,BeT2,...,BeTn)
	my @which_kog;														    #the KOGs which all the BeTs belongs to
	my %num;
	my %species;														    #what species in BeTs -- for the number of species in the behind 
	my $i;
	for $BeT (@$ref_BeTs) {
		for $KOG_num (keys %kog) {
			for $target (split /,/, $kog{$KOG_num}) {
				my $same = 0;
				if ($BeT eq $target) {										# blast parse protein in KOG
					print LOG "*****\t";
					print LOG "$BeT => $KOG_num\t*****\n";
					$same_kog{$KOG_num} .= $BeT.',';							    # the protein ($Bet) belongs to the same KOG ($KOG_num) 
					for (@which_kog) {
						if ($_ eq $KOG_num) {
							$same = 1;
							last;
						}
					}
					if ($same == 0) {
						push @which_kog, $KOG_num;
					}
					else {next;}
				}
				else {next;}
			}
			next;
		}
		next;
	}
	for $KOG_num (@which_kog) {
		for (split /,/, $same_kog{$KOG_num}) {
			$species{&which_species($_)} = '';									    # $_ is blast parse protein ($Bet),$Bet belongs to the species (the value is null)
		}
		$num{$KOG_num} = keys %species;											# $num{$KOG_num} is an array
	}
	return (\@which_kog,\%num);
}
#######################################
#                main                 #
#######################################
my ($ref_kog,$ref_functions) = &get_kog;
my $kogresult="$out/$basename.cog.out";
my $koglog="$out/$basename.coglog";

open BLASTP, "$parsefile"
	or die "Can't open file: $!";
open RESULT, ">$kogresult"
	or die "Can't open file: $!";
open LOG, ">$koglog"
	or die "Can't open file: $!";
$/ = "gi|";
while (my $record = <BLASTP>) {
	chomp $record;
	$record = "gi|".$record;
	my @BeTs;
	my $gi;
	for (split /\n/, $record) {
		if (/^gi\|(.+?)\|ref/) {
			$gi = $1;
			print LOG "gi = $gi\n";
			next;
		}
		elsif (/^(.+?)\s\(\w+?\s(.+?)\)/) {
			my ($BeT, $E_value) = ($1, $2);
			if ($E_value <= 0.01) {								#1e-3 changed at Mar, 30, 2008 by liz ,blast evalue 1e-3 
				push @BeTs, $BeT;
			}
			else {
				next;
			}
		}
		else {
			next;
		}
	}
	my ($ref_which_kog, $ref_num) = &dispose($ref_kog, \@BeTs);					#ref_kog get from &get_kog, $ref_kog is proteins in KOG e.g.(protein1,protein2,...,proteinX),$ref_which_kog the KOG num ,$ref_num the number of KOG 's species numbers
	my $KOG_num;
	for $KOG_num (@$ref_which_kog) {
		print RESULT "$KOG_num: $$ref_num{$KOG_num}\n";
		if ($$ref_num{$KOG_num} >= 3) {								#print  the KOGs and choose the KOGs in 3 species
			print RESULT "$gi in $KOG_num\n";
			print RESULT "---------------------------------\n";
			last;
		}
	}
}
close BLASTP;
close RESULT;
close LOG;
my %kog_count;
my @class=qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z);
open RESULTS,"$kogresult" or die "Can't open file:$!";
my $kogannotation="$out/Annotation_COG.xls";
my $kogtxt="$out/$basename.cog.txt";
$/="\n";
my $desc="/PUBLIC/database/Common/KOG/kog_categories.txt";
my %desc;
open DESC,"$desc" or die $!;
while(<DESC>){
	chomp;
	my @array=split/\t/;
	print $array[0]."\t".$array[1]."\n";
	$desc{$array[0]}=$array[1];
	print $array[0]."\t".$desc{$array[0]}."\n";
}
close DESC;
foreach  my $key (keys %desc){
	print "$key\t$desc{$key}\n";
}
my $annotated_genes=0;
open KOGANN,">$kogannotation";
print KOGANN "Gene ID\tCOG ID\tAbbreviation\tCOG Description\n";
open KOGTXT,">$kogtxt";
while(<RESULTS>){
	chomp;
	if(/^(.+?)\sin\s(COG.+)/){
		print KOGANN "$1\t$2\t$category{$2}\t$kogann2{$2}\n";
		print KOGTXT "$1\t$2\n";
		my $kogid=$2;
		my $geneid=$1;
		my @tmp=split "",$category{$kogid};
		foreach(@tmp){
			$kog_count{$_} +=1;
			$kog_genes{$_}{$geneid}=1;
		}
	$annotated_genes++;
	}
}
close RESULTS;
close KOGANN;

open OUT,">$out/COG_classification.xls";
open STAT,">$out/COG_classification_count.txt";
print OUT "Class\tAbbreviation\tGene Numbers\tGene IDs\n";
print STAT "###\tTotal annotated genes: $annotated_genes\n";
foreach my $key(sort keys %kog_count){
	my %tmp=%{ $kog_genes{$key}};
	print OUT "$desc{$key}\t".$key."\t".$kog_count{$key}."\t".join(",",sort keys %tmp)."\n";
	print STAT $key."\t"."($key) $desc{$key}\t".$kog_count{$key}."\n";
}
close STAT;
close OUT;

my $R= <<"END";
#========================================================================================================
setwd("$out")
dat<-read.table("$out/COG_classification_count.txt",sep="\\t",skip=1);
bardat<-(dat[,3]/$annotated_genes)*100
cog.des <- as.character(dat[,2])
names(bardat) <- as.character(dat[,1])
yymax<-max(bardat)
yymax<-ceiling(yymax*1.2)
col=rainbow(nrow(dat))
#def.par <- par()
pdf("COG_classification.pdf",18,9)
layout(matrix(c(1,2),nr=1),widths=c(30,15))
op1 <- par(mar=c(4.1,5.1,4.1,0))
a<-barplot(bardat,xaxt="n",col=col,ylim=c(0,yymax),cex.names=1.2,las=1,axis.lty=0,cex.axis=1)
box()
mtext("COG Classification",side=1,line=2,font=2,cex=0.8)
mtext("Percent(%)",side=2,line=2.5,font=2,cex=0.8)
title("COG Function Classification")
par(cex.axis=0.6)
axis(side=1,at=a,labels=dat[,1],padj=-2,tick=FALSE)
op1 <- par(mar=c(0,0,1.1,1.1))
plot(0,0,pch="",xlim=c(0,1),ylim=c(0,yymax),bty="n",axes=F,xlab="",ylab="")
for(i in length(cog.des):1){
        text(0.01,i*yymax/(nrow(dat)+1),cog.des[length(cog.des)+1-i],pos=4,cex=1)
}
par(op1)
#par(def.par)
dev.off()

png("COG_classification.png",width=1250,height=650,type="cairo-png")
layout(matrix(c(1,2),nr=1),widths=c(30,15))
op1 <- par(mar=c(4.1,5.1,4.1,0))
a<-barplot(bardat,xaxt="n",col=col,ylim=c(0,yymax),cex.names=1.2,las=1,axis.lty=0,cex.axis=1)
box()
mtext("COG Classification",side=1,line=2,font=2,cex=0.8)
mtext("Percent(%)",side=2,line=2.5,font=2,cex=0.8)
title("COG Function Classification")
par(cex.axis=0.6)
axis(side=1,at=a,labels=dat[,1],padj=-2,tick=FALSE)
op1 <- par(mar=c(0,0,1.1,1.1))
plot(0,0,pch="",xlim=c(0,1),ylim=c(0,yymax),bty="n",axes=F,xlab="",ylab="")
for(i in length(cog.des):1){
        text(0.01,i*yymax/(nrow(dat)+1),cog.des[length(cog.des)+1-i],pos=4,cex=1)
}
par(op1)
#par(def.par)
dev.off()
#========================================================================================================
END
open R,"|/PUBLIC/software/RNA/R-3.1.2/R-3.1.2/bin/R --vanilla --slave" or die $!;
print R $R;
close R;

open RSCRIPT,">$out/COG_classification.R";
print RSCRIPT $R;
close RSCRIPT;