/TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/zuotu/tsne_umap_pca_pcoa_nmds_plsda_plot #!/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript #@author: huangsonglin #Email:huangsonglin@novogene.com suppressMessages({ library(argparser) library(vegan) library(ape) library(mixOmics) library(ggrepel) library(Rtsne) library(umap) library(ggplot2)}) argv <- arg_parser('') argv <- add_argument(argv,"--fpkm", help="the fpkm file") argv <- add_argument(argv,"--condition", help="the condition file") argv <- add_argument(argv,"--prefix", help="the prefix of outfile") argv <- add_argument(argv,"--type", help="one of tsne umap pca pcoa nmds or plsda") argv <- add_argument(argv,"--add", help="add confidence interval",flag=TRUE) argv <- parse_args(argv) fpkm <- argv$fpkm condition <- argv$condition prefix <- argv$prefix type <- argv$type add <- argv$add if(is.na(type)){type <- 'pca'} fpkm <- read.delim(fpkm,header=TRUE,sep='\t',row.names=1,quote='') condition <- read.delim(condition,header=TRUE,sep='\t',quote='') groupdata <- condition[!duplicated(as.character(condition$sample)),] rownames(groupdata) <- as.character(groupdata$sample) sample <- as.character(unique(groupdata$sample)) group <- as.character(unique(groupdata$group)) groupdata$group <- factor(groupdata$group,levels=group) data <- subset(fpkm,select=sample) data <- data[rowSums(data)>1,] data <- na.omit(data) data <- log2(data+1) if (type == "pca"){ data <- scale(data) data <- t(scale(t(data),center=TRUE,scale=F)) pca <- prcomp(t(data),center=FALSE,scale.=FALSE) pca_data <- data.frame(pc1=pca$x[,1],pc2=pca$x[,2],sample=colnames(data),group=groupdata[colnames(data),'group']) pct <- (pca$sdev^2)/sum(pca$sdev^2) pc1 <- sprintf("(%.2f%%)",pct[1]*100) pc2 <- sprintf("(%.2f%%)",pct[2]*100) p <- ggplot(pca_data,aes(x=pc1,y=pc2,colour=group)) + geom_point(size=4) +geom_text_repel(aes(pc1,pc2,label=sample),size=4) + labs(x=paste('PC1 ',pc1,sep=''),y=paste('PC2 ',pc2,sep='')) + geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') + geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') + theme_bw() + theme(plot.title=element_text(hjust=0.5)) prefix <- paste0(prefix,'_pca') write.table(pca_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F) }else if (type == "pcoa"){ data_dist <- vegdist(as.data.frame(t(data))) Pcoa <- pcoa(data_dist,correction="none") Pcoa_data <- data.frame(pcoa1=Pcoa$vectors[,1], pcoa2=Pcoa$vectors[,2],sample=colnames(data),group=groupdata[colnames(data),'group']) pca_data <- Pcoa_data colnames(pca_data)[1:2] <- c('pc1','pc2') p <- ggplot(Pcoa_data,aes(x=pcoa1,y=pcoa2,colour=group)) + geom_point(size=4) +geom_text_repel(aes(pcoa1,pcoa2,label=sample),size=4) + labs(x="PCOA1",y="PCOA2") + geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') + geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') + theme_bw() + theme(plot.title=element_text(hjust=0.5)) prefix <- paste0(prefix,'_pcoa') write.table(Pcoa_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F) }else if (type == "nmds"){ data <- as.data.frame(t(data)) nmds <- metaMDS(data, distance = 'bray', k = 4) nmds_data <- data.frame(NMDS1=nmds$points[,1], NMDS2=nmds$points[,2],sample=rownames(data),group=groupdata[rownames(data),'group']) pca_data <- nmds_data colnames(pca_data)[1:2] <- c('pc1','pc2') p <- ggplot(nmds_data,aes(x=NMDS1,y=NMDS2,colour=group)) + geom_point(size=4) +geom_text_repel(aes(NMDS1,NMDS2,label=sample),size=4) + labs(x="NMDS1",y="NMDS2") + geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') + geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') + theme_bw() + theme(plot.title=element_text(hjust=0.5)) prefix <- paste0(prefix,'_ndms') write.table(nmds_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F) }else if (type == "tsne"){ data <- as.matrix(t(data)) print(dim(data)) set.seed(0) rtsne <- Rtsne(data,pca=FALSE,perplexity=floor((nrow(data)-1)/3),theta=0.0) rtsne_data <- data.frame(tsne1 = rtsne$Y[,1],tsne2=rtsne$Y[,2],sample=rownames(data),group=groupdata[rownames(data),'group']) pca_data <- rtsne_data colnames(pca_data)[1:2] <- c('pc1','pc2') p <- ggplot(rtsne_data,aes(x=tsne1,y=tsne2,colour=group)) + geom_point(size=4) +geom_text_repel(aes(tsne1,tsne2,label=sample),size=4) + labs(x="tsne1",y="tsne2") + geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') + geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') + theme_bw() + theme(plot.title=element_text(hjust=0.5)) prefix <- paste0(prefix,'_tsne') write.table(rtsne_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F) }else if (type == "umap"){ data <- as.matrix(t(data)) umap_out <- umap(data,preserve.seed=F,n_neighbors=(nrow(data)-1)) umap_data <- data.frame(umap1=umap_out$layout[,1],umap2=umap_out$layout[,2],sample=rownames(data),group=groupdata[rownames(data),'group']) pca_data <- umap_data colnames(pca_data)[1:2] <- c('pc1','pc2') p <- ggplot(umap_data,aes(x=umap1,y=umap2,colour=group)) + geom_point(size=4) +geom_text_repel(aes(umap1,umap2,label=sample),size=4) + labs(x="umap1",y="umap2") + geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') + geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') + theme_bw() + theme(plot.title=element_text(hjust=0.5)) prefix <- paste0(prefix,'_umap') write.table(umap_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F) }else if (type == "plsda"){ data <- as.matrix(t(data)) Y <- groupdata$group names(Y) <- groupdata$sample print(Y) Y <- Y[rownames(data)] plsda_out <- plsda(data, Y, ncomp = 3) plsda_data <- data.frame(comp1=plsda_out$variates$X[,1],comp2=plsda_out$variates$X[,2],sample=rownames(data),group=groupdata[rownames(data),'group']) pca_data <- plsda_data colnames(pca_data)[1:2] <- c('pc1','pc2') Comp1 <- round(plsda_out$explained_variance$X[1] * 100,2) Comp2 <- round(plsda_out$explained_variance$X[2] * 100,2) p <- ggplot(plsda_data,aes(x=comp1,y=comp2,colour=group)) + geom_point(size=4) +geom_text_repel(aes(comp1,comp2,label=sample),size=4) + labs(x=paste0("comp1(",Comp1," %)"),y=paste0("comp2(",Comp2," %)")) + geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') + geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') + theme_bw() + theme(plot.title=element_text(hjust=0.5)) prefix <- paste0(prefix,'_plsda') write.table(plsda_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F) }else{ print('type must be one of "pca" , "pcoa" , "ndms" , "tsne" , "umap" , "plsda"') q() } if(add){ if(min(table(as.character(groupdata$group))) > 3){ p <- p + stat_ellipse(aes(x=pc1,y=pc2,colour=group),data=pca_data,level = 0.95,show.legend = FALSE) }else{ p <- p + ggforce::geom_mark_ellipse(aes(x=pc1,y=pc2,colour=group),data=pca_data,show.legend = FALSE) } } pdf(file=paste(prefix,'.pdf',sep='')) p dev.off() svg(file=paste(prefix,'.svg',sep='')) p dev.off() ggsave(file=paste(prefix,'.png',sep=''),type='cairo-png',plot=p) ### TSNE AND UMAP 降维分析 ## 统一流形逼近与投影(UMAP, Uniform Manifold Approximation and Projection) ################################################################################ ### load libraries library(ggplot2) library(umap) library(Rtsne) library(cowplot) library(vegan) setwd("/Users/fengjie/Desktop/Research/UMAP_TSNE") data <- read.table("gene_fpkm.xls",header = T,sep = "\t",row.names = 1) group <- read.table("condition.xls",header = T,sep = "\t",row.names = 1) dat <- data[,rownames(group)] dat <- t(dat) ################################################################################ #UMAP #methods:euclidean,manhattan,braycurtis etc set.seed(123) dist <- vegdist(dat,method = "bray",na.rm=TRUE) dist <- data.matrix(dist) ## samller N-neighbors, points more clustered umap <- umap(dist,n_components=2,n_neighbors=8) umap.score <- umap$layout umap.score <- as.data.frame(umap.score) colnames(umap.score) <- c("UMAP1","UMAP2") umap.score$group = group$group color <- c( "#3c5488B2","#00A087B2", "#F39B7FB2","#91D1C2B2", "#8491B4B2","#DC0000B2", "#7E6148B2","yellow", "darkolivegreen1","lightskyblue", "darkgreen","deeppink","khaki2", "firebrick","brown1","darksalmon", "darkgoldenrod1","darkseagreen","darkorchid") ################################################################################ #plot pl <- ggplot(umap.score, aes(UMAP1,UMAP2,color=group))+ theme_classic()+ geom_vline(xintercept = 0, color = "grey", size = 0.3) + geom_hline(yintercept = 0, color = "grey", size = 0.3) + geom_point(size = 4)+ scale_color_manual(values = color[1:length(unique(group$group))]) + theme(panel.grid = element_line(color = "grey",linetype = 2,size = 0.1), panel.background = element_rect(color = "black", fill = "transparent"), legend.title = element_blank(), legend.text = element_text(size = 12), axis.text = element_text(size = 10), axis.title = element_text(size = 12))+ labs(x = "UMAP1", y = "UMAP2") ################################################################################ ###### ## t-SNE(t-distributed stochastic neighbor embedding)T分布随机近邻嵌入 #是用于降维的一种学习算法. # TSNE set.seed(123) ## smaller perplexity, more clustered points tsne.out <- Rtsne(dat,dims=2,pca=FALSE,perplexity = 3) tsne <- as.data.frame(tsne.out$Y) colnames(tsne) <- c("tsne1","tsne2") rownames(tsne) <- rownames(group) tsne$group <- factor(group$group) p2 <- ggplot(tsne,aes(tsne1,tsne2,color = group)) + theme_classic() + geom_point(size=4) + scale_color_manual(values = color) + geom_vline(xintercept = 0, color = "grey", size = 0.5) + geom_hline(yintercept = 0, color = "grey", size = 0.3) + theme(legend.text = element_text(size = 12), axis.text = element_text(size = 10), panel.grid = element_line(color = "grey", linetype = 2, size = 0.1), panel.background = element_rect(color = "black", fill = "transparent"), legend.title = element_blank()) + labs(x="TSNE1",y="TSNE2") out <- plot_grid(pl,p2,NULL,lables = c("A","B",""),ncol = 2) pdf("UMAP_TSNE.pdf") out dev.off()