用户工具

站点工具


个性化条目:tsne与umap聚类分析

/TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/zuotu/tsne_umap_pca_pcoa_nmds_plsda_plot

#!/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript
#@author: huangsonglin
#Email:huangsonglin@novogene.com

suppressMessages({
library(argparser)
library(vegan)
library(ape)
library(mixOmics)
library(ggrepel)
library(Rtsne)
library(umap)
library(ggplot2)})

argv <- arg_parser('')
argv <- add_argument(argv,"--fpkm", help="the fpkm file")
argv <- add_argument(argv,"--condition", help="the condition file")
argv <- add_argument(argv,"--prefix", help="the prefix of outfile")
argv <- add_argument(argv,"--type", help="one of tsne umap pca pcoa nmds or plsda")
argv <- add_argument(argv,"--add", help="add confidence interval",flag=TRUE)
argv <- parse_args(argv)

fpkm <- argv$fpkm
condition <- argv$condition
prefix <- argv$prefix
type <- argv$type
add <- argv$add

if(is.na(type)){type <- 'pca'}
fpkm <- read.delim(fpkm,header=TRUE,sep='\t',row.names=1,quote='')
condition <- read.delim(condition,header=TRUE,sep='\t',quote='')
groupdata <- condition[!duplicated(as.character(condition$sample)),]
rownames(groupdata) <- as.character(groupdata$sample)
sample <- as.character(unique(groupdata$sample))
group <- as.character(unique(groupdata$group))
groupdata$group <- factor(groupdata$group,levels=group)

data <- subset(fpkm,select=sample)
data <- data[rowSums(data)>1,]
data <- na.omit(data)
data <- log2(data+1)
if (type == "pca"){
    data <- scale(data)
    data <- t(scale(t(data),center=TRUE,scale=F))
    pca <- prcomp(t(data),center=FALSE,scale.=FALSE)
    pca_data <- data.frame(pc1=pca$x[,1],pc2=pca$x[,2],sample=colnames(data),group=groupdata[colnames(data),'group'])

    pct <- (pca$sdev^2)/sum(pca$sdev^2)
    pc1 <- sprintf("(%.2f%%)",pct[1]*100)
    pc2 <- sprintf("(%.2f%%)",pct[2]*100)

    p <- ggplot(pca_data,aes(x=pc1,y=pc2,colour=group)) +
        geom_point(size=4) +geom_text_repel(aes(pc1,pc2,label=sample),size=4) +
        labs(x=paste('PC1 ',pc1,sep=''),y=paste('PC2 ',pc2,sep='')) +
        geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
        geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
        theme_bw() +
        theme(plot.title=element_text(hjust=0.5))
    prefix <- paste0(prefix,'_pca')
    write.table(pca_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "pcoa"){
    data_dist <- vegdist(as.data.frame(t(data)))
    Pcoa <- pcoa(data_dist,correction="none")
    Pcoa_data <- data.frame(pcoa1=Pcoa$vectors[,1], pcoa2=Pcoa$vectors[,2],sample=colnames(data),group=groupdata[colnames(data),'group'])
    pca_data <- Pcoa_data
    colnames(pca_data)[1:2] <- c('pc1','pc2')
    p <- ggplot(Pcoa_data,aes(x=pcoa1,y=pcoa2,colour=group)) +
        geom_point(size=4) +geom_text_repel(aes(pcoa1,pcoa2,label=sample),size=4) +
        labs(x="PCOA1",y="PCOA2") +
        geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
        geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
        theme_bw() +
        theme(plot.title=element_text(hjust=0.5))
    prefix <- paste0(prefix,'_pcoa')
    write.table(Pcoa_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "nmds"){
    data <- as.data.frame(t(data))
    nmds <- metaMDS(data, distance = 'bray', k = 4)
    nmds_data <- data.frame(NMDS1=nmds$points[,1], NMDS2=nmds$points[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
    pca_data <- nmds_data
    colnames(pca_data)[1:2] <- c('pc1','pc2')
    p <- ggplot(nmds_data,aes(x=NMDS1,y=NMDS2,colour=group)) +
        geom_point(size=4) +geom_text_repel(aes(NMDS1,NMDS2,label=sample),size=4) +
        labs(x="NMDS1",y="NMDS2") +
        geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
        geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
        theme_bw() +
        theme(plot.title=element_text(hjust=0.5))
    prefix <- paste0(prefix,'_ndms')
    write.table(nmds_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "tsne"){
    data <- as.matrix(t(data))
    print(dim(data))
    set.seed(0)
    rtsne <- Rtsne(data,pca=FALSE,perplexity=floor((nrow(data)-1)/3),theta=0.0)
    rtsne_data <- data.frame(tsne1 = rtsne$Y[,1],tsne2=rtsne$Y[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
    pca_data <- rtsne_data
    colnames(pca_data)[1:2] <- c('pc1','pc2')
    p <- ggplot(rtsne_data,aes(x=tsne1,y=tsne2,colour=group)) +
        geom_point(size=4) +geom_text_repel(aes(tsne1,tsne2,label=sample),size=4) +
        labs(x="tsne1",y="tsne2") +
        geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
        geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
        theme_bw() +
        theme(plot.title=element_text(hjust=0.5))
    prefix <- paste0(prefix,'_tsne')
    write.table(rtsne_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "umap"){
    data <- as.matrix(t(data))
    umap_out <- umap(data,preserve.seed=F,n_neighbors=(nrow(data)-1))
    umap_data <- data.frame(umap1=umap_out$layout[,1],umap2=umap_out$layout[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
    pca_data <- umap_data
    colnames(pca_data)[1:2] <- c('pc1','pc2')
    p <- ggplot(umap_data,aes(x=umap1,y=umap2,colour=group)) +
        geom_point(size=4) +geom_text_repel(aes(umap1,umap2,label=sample),size=4) +
        labs(x="umap1",y="umap2") +
        geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
        geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
        theme_bw() +
        theme(plot.title=element_text(hjust=0.5))
    prefix <- paste0(prefix,'_umap')
    write.table(umap_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else if (type == "plsda"){
    data <- as.matrix(t(data))
    Y <- groupdata$group
    names(Y) <- groupdata$sample
    print(Y)
    Y <- Y[rownames(data)]
    plsda_out <- plsda(data, Y, ncomp = 3)
    plsda_data <- data.frame(comp1=plsda_out$variates$X[,1],comp2=plsda_out$variates$X[,2],sample=rownames(data),group=groupdata[rownames(data),'group'])
    pca_data <- plsda_data
    colnames(pca_data)[1:2] <- c('pc1','pc2')
    Comp1 <- round(plsda_out$explained_variance$X[1] * 100,2)
    Comp2 <- round(plsda_out$explained_variance$X[2] * 100,2)
    p <- ggplot(plsda_data,aes(x=comp1,y=comp2,colour=group)) +
        geom_point(size=4) +geom_text_repel(aes(comp1,comp2,label=sample),size=4) +
        labs(x=paste0("comp1(",Comp1," %)"),y=paste0("comp2(",Comp2," %)")) +
        geom_hline(yintercept=0,linetype='dotdash',size=0.8,color='grey') +
        geom_vline(xintercept=0,linetype='dotdash',size=0.8,color='grey') +
        theme_bw() +
        theme(plot.title=element_text(hjust=0.5))
    prefix <- paste0(prefix,'_plsda')
    write.table(plsda_data,file=paste0(prefix,'.xls'),sep='\t',quote=F,row.names=F)
}else{
    print('type must be one of "pca" , "pcoa" , "ndms" , "tsne" , "umap" , "plsda"')
    q()
}

if(add){
    if(min(table(as.character(groupdata$group))) > 3){
        p <- p + stat_ellipse(aes(x=pc1,y=pc2,colour=group),data=pca_data,level = 0.95,show.legend = FALSE)
    }else{
        p <- p + ggforce::geom_mark_ellipse(aes(x=pc1,y=pc2,colour=group),data=pca_data,show.legend = FALSE)
    }
}
pdf(file=paste(prefix,'.pdf',sep=''))
p
dev.off()

svg(file=paste(prefix,'.svg',sep=''))
p
dev.off()
ggsave(file=paste(prefix,'.png',sep=''),type='cairo-png',plot=p)
### TSNE AND UMAP 降维分析

## 统一流形逼近与投影(UMAP, Uniform Manifold Approximation and Projection)
################################################################################

### load libraries

library(ggplot2)
library(umap)
library(Rtsne)
library(cowplot)
library(vegan)

setwd("/Users/fengjie/Desktop/Research/UMAP_TSNE")
data <- read.table("gene_fpkm.xls",header = T,sep = "\t",row.names = 1)
group <- read.table("condition.xls",header = T,sep = "\t",row.names = 1)
dat <- data[,rownames(group)]
dat <- t(dat)
################################################################################
#UMAP
#methods:euclidean,manhattan,braycurtis etc
set.seed(123)
dist <- vegdist(dat,method = "bray",na.rm=TRUE)
dist <- data.matrix(dist)
## samller N-neighbors, points more clustered
umap <- umap(dist,n_components=2,n_neighbors=8)
umap.score <- umap$layout
umap.score <- as.data.frame(umap.score)
colnames(umap.score) <- c("UMAP1","UMAP2")
umap.score$group = group$group
color <- c( "#3c5488B2","#00A087B2",
            "#F39B7FB2","#91D1C2B2",
            "#8491B4B2","#DC0000B2",
            "#7E6148B2","yellow",
            "darkolivegreen1","lightskyblue",
            "darkgreen","deeppink","khaki2",
            "firebrick","brown1","darksalmon",
            "darkgoldenrod1","darkseagreen","darkorchid")
################################################################################
#plot
pl <- ggplot(umap.score, aes(UMAP1,UMAP2,color=group))+
  theme_classic()+
  geom_vline(xintercept = 0, color = "grey", size = 0.3) +
  geom_hline(yintercept = 0, color = "grey", size = 0.3) +
  geom_point(size = 4)+
  scale_color_manual(values = color[1:length(unique(group$group))]) +
  theme(panel.grid = element_line(color = "grey",linetype = 2,size = 0.1),
    panel.background = element_rect(color = "black", fill = "transparent"),
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12))+
  labs(x = "UMAP1", y = "UMAP2")

################################################################################
######
## t-SNE(t-distributed stochastic neighbor embedding)T分布随机近邻嵌入
#是用于降维的一种学习算法.
# TSNE
set.seed(123)
## smaller perplexity, more clustered points
tsne.out <- Rtsne(dat,dims=2,pca=FALSE,perplexity = 3)
tsne <- as.data.frame(tsne.out$Y)
colnames(tsne) <- c("tsne1","tsne2")
rownames(tsne) <- rownames(group)
tsne$group <- factor(group$group)
p2 <- ggplot(tsne,aes(tsne1,tsne2,color = group)) +
  theme_classic() +
  geom_point(size=4) +
  scale_color_manual(values = color) +
  geom_vline(xintercept = 0, color = "grey", size = 0.5) +
  geom_hline(yintercept = 0, color = "grey", size = 0.3) +
  theme(legend.text = element_text(size = 12),
        axis.text = element_text(size = 10),
        panel.grid = element_line(color = "grey", linetype = 2, size = 0.1),
        panel.background = element_rect(color = "black", fill = "transparent"),
        legend.title = element_blank()) +
  labs(x="TSNE1",y="TSNE2")

out <- plot_grid(pl,p2,NULL,lables = c("A","B",""),ncol = 2)
pdf("UMAP_TSNE.pdf")
out
dev.off()
个性化条目/tsne与umap聚类分析.txt · 最后更改: 2023/03/15 07:50 由 fengjie