======TCC 多组合差异分析======
项目:P101SC17080321-02-B1[西北师范大学1个异域棱果沙棘三代无参全长转录组+36个沙棘二代转录组测序分析技术服务(委托)合同 \\
Evaluation of methods for differential expression analysis on multi-group RNA-seq count data--TCC包中基于DEGES的流程能有效地对三组数据进行差异分析,其中有少量生物学重复(2-6组生物学重复)的使用edgeR的DEGES流程(EEE-E),没有生物学重复的使用DESeq2的DEGES流程(SSS-S) \\
方法:即在使用X-Y流程筛选出差异基因之后,去除这些差异基因重新对数据进行标准化,并根据再次标准化后的数据再次筛选差异基因。这个流程可以多次迭代,也就是X-(Y-X)n-Y流程。\\
如下:
{{:售后:pasted:zhangcuijie_20180703154536.png}} \\
===脚本===
/TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/diff/TCC.R \\
使用:
/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript /TJPROJ6/RNA_SH/personal_dir/zhangxin/jiaoben/diff/TCC.R --count merged2.readcount --condition group.txt --cpname N_RvsG_RvsS_R --out .\\
===结果===
{{:售后:pasted:zhangcuijie_20180703155413.png}} \\
说明:多组合差异分析:使用R包TCC,版本:1.16.0
gene_id:基因id
pvalue:统计学差异显著性检验指标
FDR:校正后的pvalue
rank:排序值
estimatedDEG:是否为差异基因 1:是 0:否
===代码===
#!/TJPROJ6/RNA_SH/personal_dir/zhangxin/miniconda/envs/R_3.6.0/bin/Rscript
suppressMessages({
library(TCC)
library(ROC)
library(argparser)})
argv <- arg_parser('')
argv <- add_argument(argv,"--count", help="the count file")
argv <- add_argument(argv,"--condition", help="the condition file")
argv <- add_argument(argv,"--cpname", help="the compare name")
argv <- add_argument(argv,"--out", help="the out dir, default:now path")
argv <- parse_args(argv)
count <- argv$count
condition <- argv$condition
cpname <- argv$cpname
out <- argv$out
compares <- strsplit(cpname,'vs')[[1]]
condition <- read.delim(condition,header=T,sep='\t')
if(is.na(out)){out <- getwd()}
if(!dir.exists(out)){dir.create(out)}
groupdata <- data.frame()
soft <- 'edger'
for (i in compares) {
groupdata <- rbind(groupdata,condition[which(as.character(condition$group) %in% i),])
if(dim(condition[which(as.character(condition$group) %in% i),])[1] == 1){soft <- 'deseq2'}
}
group <- as.character(unique(groupdata$group))
groupnames <- group
group <- factor(groupdata$group,levels=group)
group<-c(group)
rownames(groupdata) <- as.character(groupdata$sample)
count <- read.delim(count,header=TRUE,sep='\t',row.names=1)
countdata <- subset(count,select=rownames(groupdata))
tcc <- new("TCC", countdata, group)
tcc <- filterLowCountGenes(tcc)
print(paste0("soft is ",soft))
if(soft == 'edger'){
design <- model.matrix(~ as.factor(group))
coef <- 2:length(unique(group))
eeee <- calcNormFactors(tcc, norm.method = "tmm", test.method = "edger", iteration = 3,design = design, coef = coef)
normalized_count <- getNormalizedData(eeee)
normalized_count_colname <- colnames(normalized_count)
for(i in 1:length(groupnames)){
normalized_count <- cbind(normalized_count,rowMeans(subset(normalized_count,select=as.character(groupdata[which(as.character(groupdata$group)==as.character(groupnames[i])),'sample']))))
normalized_count_colname <- c(normalized_count_colname,as.character(groupnames[i]))}
colnames(normalized_count) <- normalized_count_colname
DE <- estimateDE(eeee, test.method = "edger",FDR=0.05)
}else if(soft == 'deseq2'){
ssss <- calcNormFactors(tcc, norm.method = "deseq2", test.method = "deseq2", iteration = 3)
normalized_count <- getNormalizedData(ssss)
DE <- estimateDE(ssss, test.method = "deseq2",full = ~ group, reduced = ~ 1,FDR = 0.05)
}
result <- getResult(DE,sort = TRUE)
names(result)[names(result)=="p.value"] <- 'pvalue'
names(result)[names(result)=="q.value"] <- 'FDR'
alloutSorted <- result[order(result$FDR),]
normalized_count <- normalized_count[as.character(alloutSorted[,'gene_id']),]
all<- data.frame(gene_id=alloutSorted[,'gene_id'],normalized_count,alloutSorted[,c("pvalue","FDR","rank","estimatedDEG")])
write.table(all, file=paste(out,'/',cpname,"_all.xls",sep=''), sep="\t", quote=F, row.name=F)
diff_result <- subset(all,FDR<0.05)
write.table(diff_result,file=paste(out,'/',cpname,"_diff.xls",sep=''),, sep="\t", quote=F, row.name=F)
cat('gene_id:基因id\nsample 矫正后的readcount值(有生物学重复还包括组的均值)\npvalue:统计学差异显著性检验指标\nFDR:校正后的pvalue\nrank:排序值\nestimatedDEG:是否为差异基因 1:是 0:否\n',file=paste0(out,'/readme.txt'))