TCC 多组合差异分析

项目:P101SC17080321-02-B1[西北师范大学1个异域棱果沙棘三代无参全长转录组+36个沙棘二代转录组测序分析技术服务(委托)合同

Evaluation of methods for differential expression analysis on multi-group RNA-seq count data–TCC包中基于DEGES的流程能有效地对三组数据进行差异分析,其中有少量生物学重复(2-6组生物学重复)的使用edgeR的DEGES流程(EEE-E),没有生物学重复的使用DESeq2的DEGES流程(SSS-S)

方法:即在使用X-Y流程筛选出差异基因之后,去除这些差异基因重新对数据进行标准化,并根据再次标准化后的数据再次筛选差异基因。这个流程可以多次迭代,也就是X-(Y-X)n-Y流程。
如下:

脚本

/TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/diff/TCC.R

使用: /TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript /TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/diff/TCC.R –count merged2.readcount –condition group.txt –cpname N_RvsG_RvsS_R –out .

结果


说明:多组合差异分析:使用R包TCC,版本:1.16.0
gene_id:基因id
pvalue:统计学差异显著性检验指标
FDR:校正后的pvalue
rank:排序值
estimatedDEG:是否为差异基因 1:是 0:否

代码

#!/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript

suppressMessages({
library(TCC)
library(ROC)
library(argparser)})
argv <- arg_parser('')
argv <- add_argument(argv,"--count", help="the count file")
argv <- add_argument(argv,"--condition", help="the condition file")
argv <- add_argument(argv,"--cpname", help="the compare name")
argv <- add_argument(argv,"--out", help="the out dir, default:now path")
argv <- parse_args(argv)
count <- argv$count
condition <- argv$condition
cpname <- argv$cpname
out <- argv$out

compares <- strsplit(cpname,'vs')[[1]]
condition <- read.delim(condition,header=T,sep='\t')
if(is.na(out)){out <- getwd()}
if(!dir.exists(out)){dir.create(out)}
groupdata <- data.frame()
soft <- 'edger'
for (i in compares) {
    groupdata <- rbind(groupdata,condition[which(as.character(condition$group) %in% i),])
    if(dim(condition[which(as.character(condition$group) %in% i),])[1] == 1){soft <- 'deseq2'}
}
group <- as.character(unique(groupdata$group))
groupnames <- group
group <- factor(groupdata$group,levels=group)
group<-c(group)
rownames(groupdata) <- as.character(groupdata$sample)

count <- read.delim(count,header=TRUE,sep='\t',row.names=1)
countdata <- subset(count,select=rownames(groupdata))
tcc <- new("TCC", countdata, group)
tcc <- filterLowCountGenes(tcc)
print(paste0("soft is ",soft))
if(soft == 'edger'){
    design <- model.matrix(~ as.factor(group))
    coef <- 2:length(unique(group))
    eeee <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", iteration = 3,design = design, coef = coef)
    normalized_count <- getNormalizedData(eeee)
    normalized_count_colname <- colnames(normalized_count)
    for(i in 1:length(groupnames)){
        normalized_count <- cbind(normalized_count,rowMeans(subset(normalized_count,select=as.character(groupdata[which(as.character(groupdata$group)==as.character(groupnames[i])),'sample']))))
        normalized_count_colname <- c(normalized_count_colname,as.character(groupnames[i]))}
    colnames(normalized_count) <- normalized_count_colname
    DE <- estimateDE(eeee, test.method = "edger",FDR=0.05)
}else if(soft == 'deseq2'){
    ssss <- calcNormFactors(tcc, norm.method = "deseq2", test.method = "deseq2", iteration = 3)
    normalized_count <- getNormalizedData(ssss)
    DE <- estimateDE(ssss, test.method = "deseq2",full = ~ group, reduced = ~ 1,FDR = 0.05)
}
result <- getResult(DE,sort = TRUE)
names(result)[names(result)=="p.value"] <- 'pvalue'
names(result)[names(result)=="q.value"] <- 'FDR'
alloutSorted <- result[order(result$FDR),]
normalized_count <- normalized_count[as.character(alloutSorted[,'gene_id']),]
all<- data.frame(gene_id=alloutSorted[,'gene_id'],normalized_count,alloutSorted[,c("pvalue","FDR","rank","estimatedDEG")])
write.table(all, file=paste(out,'/',cpname,"_all.xls",sep=''), sep="\t", quote=F, row.name=F)
diff_result <- subset(all,FDR<0.05)
write.table(diff_result,file=paste(out,'/',cpname,"_diff.xls",sep=''),, sep="\t", quote=F, row.name=F)
cat('gene_id:基因id\nsample 矫正后的readcount值(有生物学重复还包括组的均值)\npvalue:统计学差异显著性检验指标\nFDR:校正后的pvalue\nrank:排序值\nestimatedDEG:是否为差异基因 1:是 0:否\n',file=paste0(out,'/readme.txt'))