# !/usr/bin/python
# -*—coding=utf-8-*-
# author: lizhengnan
import os
import re
import argparse
parser = argparse.ArgumentParser(description='对wgcna结果中每个模块内的基因进行富集分析')
# 添加参数
parser.add_argument('--wgcna', type=str, help='基因-模块对应关系列表路径, 例如:/TJPROJ6/RNA_SH/shouhou/202308/X101SC23063706-Z01-J001/Result_X101SC22123709-Z01-F002-B2-16/9.WGCNA/3.模块特征及表达模式/3.4模块基因表达模式/基因-模块对应关系列表')
parser.add_argument('--gene', type=str, help='gene.xls')
parser.add_argument('--go', type=str, help='go.xls')
parser.add_argument('--kegg', type=str, help='kegg.xls')
parser.add_argument('--kegg_abbr', type=str, help='kegg物种缩写')
parser.add_argument('--outdir', type=str, help='the outdir')
argv = vars(parser.parse_args())
if argv['wgcna'] is None:
raise Exception('You should provide WGCNA result dir!')
else:
wgcna = argv['wgcna'].strip()
if argv['gene'] is None:
raise Exception('You should provide gene.xls!')
else:
gene = argv['gene'].strip()
if argv['go'] is None:
raise Exception('You should provide go.xls!')
else:
go = argv['go'].strip()
if argv['kegg'] is None:
raise Exception('You should provide the kegg.xls file !')
else:
kegg = argv['kegg'].strip()
if argv['kegg_abbr'] is None:
raise Exception('You should provide the kegg abbr infomation !')
else:
kegg_abbr = argv['kegg_abbr'].strip()
if argv['outdir'] is None:
outdir = './'
else:
outdir = argv['outdir'].strip()
if not os.path.exists(outdir):
os.mkdir(outdir)
script_dir = os.path.join(outdir, 'log')
if not os.path.exists(script_dir):
os.makedirs(script_dir)
def get_module_name(dir):
module_file = os.listdir(dir)
module_name = []
for i in module_file:
name = re.match('^LinkID-(.*).txt$', i).group(1)
module_name.append(name)
# 删除all 和 gery 模块名,这两个模块不做富集分析
module_name.remove('all')
module_name.remove('grey')
return module_name
def enrich_cmd(wgcna, module_name, gene, go, kegg, kegg_abbr, script_dir):
for module in module_name:
script_file = os.path.join(script_dir, module + "_enrich.sh")
cmd = 'export PYTHONPATH=\n'
cmd += 'unset PERL5LIB\n'
cmd += 'export PATH=/TJPROJ2/GB/PUBLIC/software/GB_TR/mRNA/miniconda3/envs/r_3.5.0/bin:$PATH\n\n'
# 添加gene_name
cmd += "sed -i '1igene_id' {}/LinkID-{}.txt\n".format(wgcna, module)
cmd += 'Rscript /TJPROJ6/RNA_SH/personal_dir/lizhengnan/scripts/add_anno.R --files {}/LinkID-{}.txt --anno {}\n\n'.format(wgcna, module, gene)
# GO
go_outdir = os.path.join(outdir, 'Enrichment/1.GO', module)
cmd += 'mkdir -p {}\n\n'.format(go_outdir)
cmd += '/TJPROJ2/GB/PUBLIC/source/GB_TR/mRNA/gb_trans/gb_MedRef_man_pipline/bin/GO --go {} --diffgene {}/LinkID-{}.txt --diffresult {}' \
' --enrich normal --prefix {}/{}\n\n'.format(go, wgcna, module, gene, go_outdir, module)
cmd += '/TJPROJ2/GB/PUBLIC/source/GB_TR/mRNA/gb_trans/gb_MedRef_man_pipline/bin/bar_plot --stat {}/{}_GOenrich.xls' \
' --type GO --cutoff padj --prefix {}/{}_GObar\n'.format(go_outdir, module, go_outdir, module)
cmd += '/TJPROJ2/GB/PUBLIC/source/GB_TR/mRNA/gb_trans/gb_MedRef_man_pipline/bin/dot_plot --stat {}/{}_GOenrich.xls' \
' --type GO --cutoff padj --prefix {}/{}_GOdot\n\n'.format(go_outdir, module,go_outdir, module)
# KEGG
kegg_outdir = os.path.join(outdir, 'Enrichment/2.KEGG', module)
cmd += 'mkdir -p {}\n\n'.format(kegg_outdir)
cmd += '/TJPROJ2/GB/PUBLIC/source/GB_TR/mRNA/gb_trans/gb_MedRef_man_pipline/bin/KEGG --kegg {} --diffgene {}/LinkID-{}.txt --diffresult {} ' \
' --enrich normal --prefix {}/{} --abbr {}\n\n'.format(kegg, wgcna, module, gene, kegg_outdir, module, kegg_abbr)
cmd += '/TJPROJ2/GB/PUBLIC/source/GB_TR/mRNA/gb_trans/gb_MedRef_man_pipline/bin/bar_plot --stat {}/{}_KEGGenrich.xls' \
' --type KEGG --cutoff padj --prefix {}/{}_KEGGbar\n'.format(kegg_outdir, module, kegg_outdir, module)
cmd += '/TJPROJ2/GB/PUBLIC/source/GB_TR/mRNA/gb_trans/gb_MedRef_man_pipline/bin/dot_plot --stat {}/{}_KEGGenrich.xls' \
' --type KEGG --cutoff padj --prefix {}/{}_KEGGdot\n\n'.format(kegg_outdir, module, kegg_outdir, module)
cmd += 'cd {}\n'.format(kegg_outdir)
cmd += 'python2 /TJPROJ6/RNA_SH/personal_dir/lizhengnan/scripts/WGCNA/simple_pathway_html.py --table {}/{}_KEGGenrich.xls' \
' --prefix {}/{} --abbr {}\n'.format(kegg_outdir, module, kegg_outdir, module, kegg_abbr)
cmd += 'mv {}_pathway_detail.html {}_KEGGenrich.html'.format(module, module)
with open(script_file, "w") as f:
f.write(cmd)
module_name = get_module_name(wgcna)
enrich_cmd(wgcna, module_name, gene, go, kegg, kegg_abbr, script_dir)
使用方法:
python3 wgcna_enrich.py --help
usage: wgcna_enrich.py [-h] [--wgcna WGCNA] [--gene GENE] [--go GO]
[--kegg KEGG] [--kegg_abbr KEGG_ABBR] [--outdir OUTDIR]
对wgcna结果中除了all和grey模块, 对其余模块内的基因进行富集分析
optional arguments:
-h, --help show this help message and exit
--wgcna WGCNA 输入WGCNA结果文件路径
--gene GENE gene.xls
--go GO go.xls
--kegg KEGG kegg.xls
--kegg_abbr KEGG_ABBR
kegg物种缩写
--outdir OUTDIR the outdir