项目:P101SC17080321-02-B1[西北师范大学1个异域棱果沙棘三代无参全长转录组+36个沙棘二代转录组测序分析技术服务(委托)合同
Evaluation of methods for differential expression analysis on multi-group RNA-seq count data–TCC包中基于DEGES的流程能有效地对三组数据进行差异分析,其中有少量生物学重复(2-6组生物学重复)的使用edgeR的DEGES流程(EEE-E),没有生物学重复的使用DESeq2的DEGES流程(SSS-S)
方法:即在使用X-Y流程筛选出差异基因之后,去除这些差异基因重新对数据进行标准化,并根据再次标准化后的数据再次筛选差异基因。这个流程可以多次迭代,也就是X-(Y-X)n-Y流程。
如下:
/TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/diff/TCC.R
使用:
/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript /TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/diff/TCC.R –count merged2.readcount –condition group.txt –cpname N_RvsG_RvsS_R –out .
说明:多组合差异分析:使用R包TCC,版本:1.16.0 gene_id:基因id pvalue:统计学差异显著性检验指标 FDR:校正后的pvalue rank:排序值 estimatedDEG:是否为差异基因 1:是 0:否
#!/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript suppressMessages({ library(TCC) library(ROC) library(argparser)}) argv <- arg_parser('') argv <- add_argument(argv,"--count", help="the count file") argv <- add_argument(argv,"--condition", help="the condition file") argv <- add_argument(argv,"--cpname", help="the compare name") argv <- add_argument(argv,"--out", help="the out dir, default:now path") argv <- parse_args(argv) count <- argv$count condition <- argv$condition cpname <- argv$cpname out <- argv$out compares <- strsplit(cpname,'vs')[[1]] condition <- read.delim(condition,header=T,sep='\t') if(is.na(out)){out <- getwd()} if(!dir.exists(out)){dir.create(out)} groupdata <- data.frame() soft <- 'edger' for (i in compares) { groupdata <- rbind(groupdata,condition[which(as.character(condition$group) %in% i),]) if(dim(condition[which(as.character(condition$group) %in% i),])[1] == 1){soft <- 'deseq2'} } group <- as.character(unique(groupdata$group)) groupnames <- group group <- factor(groupdata$group,levels=group) group<-c(group) rownames(groupdata) <- as.character(groupdata$sample) count <- read.delim(count,header=TRUE,sep='\t',row.names=1) countdata <- subset(count,select=rownames(groupdata)) tcc <- new("TCC", countdata, group) tcc <- filterLowCountGenes(tcc) print(paste0("soft is ",soft)) if(soft == 'edger'){ design <- model.matrix(~ as.factor(group)) coef <- 2:length(unique(group)) eeee <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", iteration = 3,design = design, coef = coef) normalized_count <- getNormalizedData(eeee) normalized_count_colname <- colnames(normalized_count) for(i in 1:length(groupnames)){ normalized_count <- cbind(normalized_count,rowMeans(subset(normalized_count,select=as.character(groupdata[which(as.character(groupdata$group)==as.character(groupnames[i])),'sample'])))) normalized_count_colname <- c(normalized_count_colname,as.character(groupnames[i]))} colnames(normalized_count) <- normalized_count_colname DE <- estimateDE(eeee, test.method = "edger",FDR=0.05) }else if(soft == 'deseq2'){ ssss <- calcNormFactors(tcc, norm.method = "deseq2", test.method = "deseq2", iteration = 3) normalized_count <- getNormalizedData(ssss) DE <- estimateDE(ssss, test.method = "deseq2",full = ~ group, reduced = ~ 1,FDR = 0.05) } result <- getResult(DE,sort = TRUE) names(result)[names(result)=="p.value"] <- 'pvalue' names(result)[names(result)=="q.value"] <- 'FDR' alloutSorted <- result[order(result$FDR),] normalized_count <- normalized_count[as.character(alloutSorted[,'gene_id']),] all<- data.frame(gene_id=alloutSorted[,'gene_id'],normalized_count,alloutSorted[,c("pvalue","FDR","rank","estimatedDEG")]) write.table(all, file=paste(out,'/',cpname,"_all.xls",sep=''), sep="\t", quote=F, row.name=F) diff_result <- subset(all,FDR<0.05) write.table(diff_result,file=paste(out,'/',cpname,"_diff.xls",sep=''),, sep="\t", quote=F, row.name=F) cat('gene_id:基因id\nsample 矫正后的readcount值(有生物学重复还包括组的均值)\npvalue:统计学差异显著性检验指标\nFDR:校正后的pvalue\nrank:排序值\nestimatedDEG:是否为差异基因 1:是 0:否\n',file=paste0(out,'/readme.txt'))