/TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/zuotu/tsne_umap_pca_pcoa_nmds_plsda_plot
#!/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript
#@author: huangsonglin
#Email:huangsonglin@novogene.com
suppressMessages({
library(argparser)
library(vegan)
library(ape)
library(mixOmics)
library(ggrepel)
library(Rtsne)
library(umap)
library(ggplot2)})
argv <- arg_parser('')
argv <- add_argument(argv,"--fpkm", help="the fpkm file")
argv <- add_argument(argv,"--condition", help="the condition file")
argv <- add_argument(argv,"--prefix", help="the prefix of outfile")
argv <- add_argument(argv,"--type", help="one of tsne umap pca pcoa nmds or plsda")
argv <- add_argument(argv,"--add", help="add confidence interval",flag=TRUE)
argv <- parse_args(argv)
fpkm <- argv$fpkm
condition <- argv$condition
prefix <- argv$prefix
type <- argv$type
add <- argv$add
if(is.na(type)){type <- 'pca'}
fpkm <- read.delim(fpkm,header=TRUE,sep='\t',row.names=1,quote='')
condition <- read.delim(condition,header=TRUE,sep='\t',quote='')
groupdata <- condition[!duplicated(as.character(condition$sample)),]
rownames(groupdata) <- as.character(groupdata$sample)
sample <- as.character(unique(groupdata$sample))
group <- as.character(unique(groupdata$group))
groupdata$group <- factor(groupdata$group,levels=group)
data <- subset(fpkm,select=sample)
data <- data[rowSums(data)>1,]
data <- na.omit(data)
data <- log2(data+1)
if (type == "pca"){
data <- scale(data)
data <- t(scale(t(data),center=TRUE,scale=F))
pca <- prcomp(t(data),center=FALSE,scale.=FALSE)
pca_data <- data.frame(pc1=pca$x[,1],pc2=pca$x[,2],sample=colnames(data),group=groupdata[colnames(data),'group'])
pct <- (pca$sdev^2)/sum(pca$sdev^2)
pc1 <- sprintf("(%.2f%%)",pct[1]*100)
pc2 <- sprintf("(%.2f%%)",pct[2]*100)
p <- ggplot(pca_data,aes(x=pc1,y=pc2,colour=group)) +
geom_point(size=4) +geom_text_repel(aes(pc1,pc2,label=sample),size=4) +
labs(x=paste('PC1 ',pc1,sep=''),y=paste('PC2 ',pc2,sep='')) +
geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
theme_bw() +
theme(plot.title=element_text(hjust=0.5))
prefix <- paste0(prefix,'_pca')
write.table(pca_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "pcoa"){
data_dist <- vegdist(as.data.frame(t(data)))
Pcoa <- pcoa(data_dist,correction="none")
Pcoa_data <- data.frame(pcoa1=Pcoa$vectors[,1], pcoa2=Pcoa$vectors[,2],sample=colnames(data),group=groupdata[colnames(data),'group'])
pca_data <- Pcoa_data
colnames(pca_data)[1:2] <- c('pc1','pc2')
p <- ggplot(Pcoa_data,aes(x=pcoa1,y=pcoa2,colour=group)) +
geom_point(size=4) +geom_text_repel(aes(pcoa1,pcoa2,label=sample),size=4) +
labs(x="PCOA1",y="PCOA2") +
geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
theme_bw() +
theme(plot.title=element_text(hjust=0.5))
prefix <- paste0(prefix,'_pcoa')
write.table(Pcoa_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "nmds"){
data <- as.data.frame(t(data))
nmds <- metaMDS(data, distance = 'bray', k = 4)
nmds_data <- data.frame(NMDS1=nmds$points[,1], NMDS2=nmds$points[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
pca_data <- nmds_data
colnames(pca_data)[1:2] <- c('pc1','pc2')
p <- ggplot(nmds_data,aes(x=NMDS1,y=NMDS2,colour=group)) +
geom_point(size=4) +geom_text_repel(aes(NMDS1,NMDS2,label=sample),size=4) +
labs(x="NMDS1",y="NMDS2") +
geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
theme_bw() +
theme(plot.title=element_text(hjust=0.5))
prefix <- paste0(prefix,'_ndms')
write.table(nmds_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "tsne"){
data <- as.matrix(t(data))
print(dim(data))
set.seed(0)
rtsne <- Rtsne(data,pca=FALSE,perplexity=floor((nrow(data)-1)/3),theta=0.0)
rtsne_data <- data.frame(tsne1 = rtsne$Y[,1],tsne2=rtsne$Y[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
pca_data <- rtsne_data
colnames(pca_data)[1:2] <- c('pc1','pc2')
p <- ggplot(rtsne_data,aes(x=tsne1,y=tsne2,colour=group)) +
geom_point(size=4) +geom_text_repel(aes(tsne1,tsne2,label=sample),size=4) +
labs(x="tsne1",y="tsne2") +
geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
theme_bw() +
theme(plot.title=element_text(hjust=0.5))
prefix <- paste0(prefix,'_tsne')
write.table(rtsne_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "umap"){
data <- as.matrix(t(data))
umap_out <- umap(data,preserve.seed=F,n_neighbors=(nrow(data)-1))
umap_data <- data.frame(umap1=umap_out$layout[,1],umap2=umap_out$layout[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
pca_data <- umap_data
colnames(pca_data)[1:2] <- c('pc1','pc2')
p <- ggplot(umap_data,aes(x=umap1,y=umap2,colour=group)) +
geom_point(size=4) +geom_text_repel(aes(umap1,umap2,label=sample),size=4) +
labs(x="umap1",y="umap2") +
geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
theme_bw() +
theme(plot.title=element_text(hjust=0.5))
prefix <- paste0(prefix,'_umap')
write.table(umap_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "plsda"){
data <- as.matrix(t(data))
Y <- groupdata$group
names(Y) <- groupdata$sample
print(Y)
Y <- Y[rownames(data)]
plsda_out <- plsda(data, Y, ncomp = 3)
plsda_data <- data.frame(comp1=plsda_out$variates$X[,1],comp2=plsda_out$variates$X[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
pca_data <- plsda_data
colnames(pca_data)[1:2] <- c('pc1','pc2')
Comp1 <- round(plsda_out$explained_variance$X[1] * 100,2)
Comp2 <- round(plsda_out$explained_variance$X[2] * 100,2)
p <- ggplot(plsda_data,aes(x=comp1,y=comp2,colour=group)) +
geom_point(size=4) +geom_text_repel(aes(comp1,comp2,label=sample),size=4) +
labs(x=paste0("comp1(",Comp1," %)"),y=paste0("comp2(",Comp2," %)")) +
geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
theme_bw() +
theme(plot.title=element_text(hjust=0.5))
prefix <- paste0(prefix,'_plsda')
write.table(plsda_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else{
print('type must be one of "pca" , "pcoa" , "ndms" , "tsne" , "umap" , "plsda"')
q()
}
if(add){
if(min(table(as.character(groupdata$group))) > 3){
p <- p + stat_ellipse(aes(x=pc1,y=pc2,colour=group),data=pca_data,level = 0.95,show.legend = FALSE)
}else{
p <- p + ggforce::geom_mark_ellipse(aes(x=pc1,y=pc2,colour=group),data=pca_data,show.legend = FALSE)
}
}
pdf(file=paste(prefix,'.pdf',sep=''))
p
dev.off()
svg(file=paste(prefix,'.svg',sep=''))
p
dev.off()
ggsave(file=paste(prefix,'.png',sep=''),type='cairo-png',plot=p)
### TSNE AND UMAP 降维分析
## 统一流形逼近与投影(UMAP, Uniform Manifold Approximation and Projection)
################################################################################
### load libraries
library(ggplot2)
library(umap)
library(Rtsne)
library(cowplot)
library(vegan)
setwd("/Users/fengjie/Desktop/Research/UMAP_TSNE")
data <- read.table("gene_fpkm.xls",header = T,sep = "\t",row.names = 1)
group <- read.table("condition.xls",header = T,sep = "\t",row.names = 1)
dat <- data[,rownames(group)]
dat <- t(dat)
################################################################################
#UMAP
#methods:euclidean,manhattan,braycurtis etc
set.seed(123)
dist <- vegdist(dat,method = "bray",na.rm=TRUE)
dist <- data.matrix(dist)
## samller N-neighbors, points more clustered
umap <- umap(dist,n_components=2,n_neighbors=8)
umap.score <- umap$layout
umap.score <- as.data.frame(umap.score)
colnames(umap.score) <- c("UMAP1","UMAP2")
umap.score$group = group$group
color <- c( "#3c5488B2","#00A087B2",
"#F39B7FB2","#91D1C2B2",
"#8491B4B2","#DC0000B2",
"#7E6148B2","yellow",
"darkolivegreen1","lightskyblue",
"darkgreen","deeppink","khaki2",
"firebrick","brown1","darksalmon",
"darkgoldenrod1","darkseagreen","darkorchid")
################################################################################
#plot
pl <- ggplot(umap.score, aes(UMAP1,UMAP2,color=group))+
theme_classic()+
geom_vline(xintercept = 0, color = "grey", size = 0.3) +
geom_hline(yintercept = 0, color = "grey", size = 0.3) +
geom_point(size = 4)+
scale_color_manual(values = color[1:length(unique(group$group))]) +
theme(panel.grid = element_line(color = "grey",linetype = 2,size = 0.1),
panel.background = element_rect(color = "black", fill = "transparent"),
legend.title = element_blank(),
legend.text = element_text(size = 12),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12))+
labs(x = "UMAP1", y = "UMAP2")
################################################################################
######
## t-SNE(t-distributed stochastic neighbor embedding)T分布随机近邻嵌入
#是用于降维的一种学习算法.
# TSNE
set.seed(123)
## smaller perplexity, more clustered points
tsne.out <- Rtsne(dat,dims=2,pca=FALSE,perplexity = 3)
tsne <- as.data.frame(tsne.out$Y)
colnames(tsne) <- c("tsne1","tsne2")
rownames(tsne) <- rownames(group)
tsne$group <- factor(group$group)
p2 <- ggplot(tsne,aes(tsne1,tsne2,color = group)) +
theme_classic() +
geom_point(size=4) +
scale_color_manual(values = color) +
geom_vline(xintercept = 0, color = "grey", size = 0.5) +
geom_hline(yintercept = 0, color = "grey", size = 0.3) +
theme(legend.text = element_text(size = 12),
axis.text = element_text(size = 10),
panel.grid = element_line(color = "grey", linetype = 2, size = 0.1),
panel.background = element_rect(color = "black", fill = "transparent"),
legend.title = element_blank()) +
labs(x="TSNE1",y="TSNE2")
out <- plot_grid(pl,p2,NULL,lables = c("A","B",""),ncol = 2)
pdf("UMAP_TSNE.pdf")
out
dev.off()