用户工具

站点工具


个性化条目:差异基因转录因子结合位点预测

tfbstools软件转录因子结合位点预测

可用于转录组差异基因转录因子结合位点预测分析,并可以结合SNP结果, 对预测结果文件进行注释。此脚本根据lnc TFBS分析结合转录组分析内容串写。

背景介绍

转录因子结合位点(TFBS)指转录因子(TF)在DNA上的结合位点或区域。通常TFBS相当短,长度为4-30个碱基对,但通常位于50-200bp的较大顺式调控区域中,并且在不同
基因中重复或基因内有几次重复; 转录因子与DNA结合的部分是保守的,但通常相当小(3-5bp);准确预测TFBS对于分析基因的转录模式有着重要意义。

方法介绍

用 TFBSTools(1.18.0) 进行差异转录本的TFBS预测分析;
TFBSTools(1.18.0)是分析和处理转录因子结合位点的包,包括位置频率矩阵、位置重量矩阵和信息含量矩阵的矩阵转换,也可以通过序列/对齐、查询JASPAR数据库(2018)推测
潜在的DNA上TFBS位点;
JASPAR数据库:是保存核苷酸谱的PFM的集合,最广泛使用的是JASPAR CORE集合,它是基于试验证据的多细胞真核生物的TFBS谱的非冗余集合。
本项目中的TFBS分析过程主要包含以下步骤:
1). 取差异gene的前2Kb的DNA序列作为该gene的promoter区域序列;
2). 采用TFBSTools(1.18.0)软件, 以及 JASPAR数据库(2018) , 对gene 的promoter区域序列进行TFBS预测; (当该物种在JASPAR数据库中包含物种本身的TFBS位点时,
采用本物种的模型进行预测, 当该物种在JASPAR数据库中无本物种TFBS位点信息时, 采用“all”即全部TFBS进行预测;)
3)基因promoter区域的TFBS的遗传变异可能会导致转录本表达的差异;会结合SNP结果, 对TFBS文件进行注释。

分析方法

python /TJPROJ6/RNA_SH/personal_dir/songxiaoxi/songxiaoxi/shouhou_dir/TFBS/runtfbs.py \

  1. -diffgene /BJPROJ/RNA/shouhou/201807/P101SC18081445/20181019_tfbs/ceshi6/Union_for_cluster_150raw.xls \
  2. -species Homo_sapiens \
  3. -out_dir /BJPROJ/RNA/shouhou/201807/P101SC18081445/20181019_tfbs/ceshi6 \
  4. -fa /BJPROJ/RNA/shouhou/201807/P101SC18081445/20181019_tfbs/ceshi6/genome.fasta \
  5. -gtf /BJPROJ/RNA/shouhou/201807/P101SC18081445/20181019_tfbs/ceshi6/genome.gtf \
  6. -snp /BJPROJ/RNA/project_2018/ref_201808/P101SC17050018-01-B5-16_ren_ref_20180817/P101SC17050018-01-B5-16_TR_result/P101SC17050018-01-B5-16_results/6.SNP/SNPs.xls \
  结果文件解读为: 
  *.TFBS.xls :  seqnames: gene的promoter区域序列id(命名方式见上述文件)
                source: TFBS
                feature: TFBS
                start: TFBS位点相对gene promoter起点的位点坐标;
                end: TFBS位点相对gene promoter终点的位点坐标;
                absScore: TFBS位点得分值; 
                relScore: TFBS位点与gene promoter区域序列的相关性; 收集 相关性>80%的结果;
                strand: TFBS在正负链上的位置
                ID: TFBS id; 查询(http://jaspar.genereg.net/) 数据库获得更多注释;
                TF: 相关转录调控因子;
                class: 相关转录调控因子所属类别;
                siteSeqs: TFBS位点序列信息;
               
  • .TFBS.SNP.xls;


可参考售后路径:/BJPROJ/RNA/shouhou/201807/P101SC18081445/20181019_tfbs/ceshi6

脚本展示

#coding=utf-8\\

import sys\\
import os\\
import ConfigParser\\
LIB_LNCRNA = '/BJPROJ/RNA/shouhou/personal_dir/songxiaoxi/lib1/'\\
sys.path.insert(0, LIB_LNCRNA)\\
import lncRNA\\
import circ\\
import lncRNA_var_defs as var\\
import random\\
import glob\\
import gzip\\
import re\\
import linecache\\
import time\\
import json\\
import subprocess\\
reload(sys)\\
root_dir=os.getcwd()\\
sys.setdefaultencoding('utf-8')\\
from django.template import Template, Context, loader\\
from django.conf import settings\\
import argparse\\

parser = argparse.ArgumentParser(description = 'get TFBS in gene promoter seq')\\
parser.add_argument('--diffgene', help = 'diff_gene file', required = True)\\
parser.add_argument('--species', help = 'species', required = True)\\
#parser.add_argument('--type', help = 'seq file', required = True)\\
parser.add_argument('--n',help="split line ,default=100",default='100')\\
parser.add_argument('--fa',help='geneme_fa',required = True)\\
parser.add_argument('--gtf',help='geneme_gtf',required = True)\\
parser.add_argument('--out_dir', help = 'out_dir', required = True)\\
parser.add_argument('--snp', help = 'snp_annot')\\
argv = vars(parser.parse_args())\\
#type = argv['type'].strip()\\
species = argv['species'].strip()\\
#seq_file = argv['seq_file'].strip()\\
if argv['n'].strip() != "":\\
n=int(argv['n'].strip())\\
elif argv['n'].strip() == "":\\
n=int(100)\\
out_dir = argv['out_dir'].strip()\\
diffgene =argv['diffgene'].strip()\\
fa = argv['fa'].strip()\\
gtf = argv['gtf'].strip()\\
if argv['snp']==None:\\
snp=None\\
else:\\
snp=argv['snp'].strip()\\
###dir###\\
tmp=out_dir + '/tmp'\\
results=out_dir + '/results'\\
os.system('mkdir -p %s'%(tmp))\\
os.system('mkdir -p %s'%(results))\\
####获得差异基因所在gtf文件####\\
os.system('python /BJPROJ/RNA/shouhou/201807/P101SC18081445/20181019_tfbs/get_diff_gtf.py %s %s %s/diff_genome_gtf'%(gtf,diffgene,tmp))\\
####获得差异基因启动子序列###\\
os.system('perl /PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/get_gene_promoter.pl  %s/diff_genome_gtf %s %s/mRNA_TFBS_seq.xls'%(tmp,fa,tmp))\\
seq_file=tmp + '/' +'mRNA_TFBS_seq.xls'\\
###选择物种进行转录因子预测====\\
os.system('mkdir -p %s/split'%(out_dir))\\
os.system('mkdir -p %s/logs'%(out_dir))\\
log_out_dir = out_dir+"/"+"/logs/"\\
os.system('python /PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/split_by_row_v2.py %s/mRNA_TFBS_seq.xls %s %s/split'%(tmp,n,out_dir))\\
each_split_number = lncRNA.get_split_exp_number(seq_file,n)\\
species_list = []\\
for each in open("/PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/JASPAR.species.list","r"):\\
species_list.append("_".join(each.strip().split(" ")))\\
print species\\
print species_list\\
## calculate correlation of lncRNA(TUCP) and mRNA for each split file\\
split_count_generator = xrange(1,(each_split_number+1))\\
TFBS_split_scripts = []\\
for each in split_count_generator:\\
each_split_TFBS_sh=open( log_out_dir +"split_TFBS_"+str(each) + ".sh" ,"w")\\
each_split_TFBS_sh.write('LD_LIBRARY_PATH=/PUBLIC/source/RNA/lncRNA/softwares/gsl/gsl-2.3/usr/local/lib\n'.format(**locals()))\\
each_split_TFBS_sh.write('export LD_LIBRARY_PATH\n'.format(**locals()))\\
if species in species_list:\\
each_split_TFBS_sh.write('/PUBLIC/source/RNA/lncRNA/softwares/R/Rscript /PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/run_TFBS.R --input_file {out_dir}/split/mRNA_TFBS_seq_split_{each}.xls --species {species} --out {out_dir}/split/split_{each}\n'.format(**locals()))\\
else:\\
each_split_TFBS_sh.write\\
('/PUBLIC/source/RNA/lncRNA/softwares/R/Rscript /PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/run_TFBS_no_species.R --input_file {out_dir}/split/mRNA_TFBS_seq_split_{each}.xls  --out {out_dir}/split/split_{each}\n'.format(**locals()))
each_split_TFBS_sh.write('python /PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/run_TFBS_anno.py {out_dir}/split/split_{each}_NA_TFBS.xls\n'.format(**locals()))\\
each_split_TFBS_sh.close()\\
cmd='cat %s/split/split_*_TFBS.xls >%s/TFBS.xls\n' %(out_dir,results)\\
cmd+='cat %s/split/split_*TFBS.gff2 >%s/TFBS.gff2\n'%(out_dir,tmp)\\
cmd+='perl /BJPROJ/RNA/shouhou/201807/P101SC18081445/20181019_tfbs/add_snp_to_tfbs.pl {results}/TFBS.xls {snp}  {results}/TFBS.SNP.xls\n'.format(**locals())\\
cmd+='cp /BJPROJ/RNA/shouhou/personal_dir/wangbaojian/TFBS/TFBS_readme.txt %s/'%(results)\\
open(log_out_dir+"result.sh","w").write(cmd)\\

f_sjm=open(log_out_dir +"analysis.JOB", "w")\\
for i in range(1,each_split_number+1):\\
f_sjm.write("job_begin\n")\\
f_sjm.write("    name " + "tfbs_" + str(i) + "\n")\\
f_sjm.write("    sched_options -cwd -V -l vf=2g,p=2\n")\\
f_sjm.write("    cmd sh  "  +log_out_dir + "split_TFBS_"+str(i) + ".sh\n")\\
f_sjm.write("job_end\n")\\
f_sjm.write("\n")\\
f_sjm.write("job_begin\n")\\
f_sjm.write("    name result\n")\\
f_sjm.write("    sched_options -cwd -V -l vf=1g,p=1\n")\\
f_sjm.write("    cmd sh " + log_out_dir+"result.sh\n")\\
f_sjm.write("job_end\n")\\
###oder###\\
for i in range(1,each_split_number+1):\\
sjm.write("order result after " + "tfbs_" + str(i)+"\n")\\
f_sjm.write('\nlog_dir %s'%(log_out_dir))\\
f_sjm.close()\\
open(root_dir+'/sjm_Analysis.sh','w').write('/PUBLIC/software/public/System/sjm-1.2.0/bin/sjm %s \n' %(log_out_dir +'analysis.JOB'))\\
#assert not os.system('chmod +x %s' % (root_dir+'/sjm_Analysis.sh '))\\
'''
run_TFBS_script_step2 = []\\
run_seqLogo_script_sh = out_dir+"/"+"run_seqLogo.sh"\\
with open(run_seqLogo_script_sh,"w") as run_seqLogo_script:\\
run_seqLogo_script.write('LD_LIBRARY_PATH=/PUBLIC/source/RNA/lncRNA/softwares/gsl/gsl-2.3/usr/local/lib\n')\\
run_seqLogo_script.write('export LD_LIBRARY_PATH\n')\\
run_seqLogo_script.write('/PUBLIC/source/RNA/lncRNA/softwares/R/Rscript /PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/TFBS_Logo.R --input_file %s/TFBS.example.xls --out_dir %s'%(out_dir,out_dir))\\
run_TFBS_script_step2.append(run_seqLogo_script_sh)\\

run_SNP_anno_script_sh = out_dir+"/"+"run_SNP_anno.sh"\\
with open(run_SNP_anno_script_sh,"w") as run_SNP_anno_script:\\
run_SNP_anno_script.write('perl /PUBLIC/source/RNA/lncRNA/pipeline/version6/scripts/TFBS/add_snp_to_tfbs.pl %s/TFBS.xls %s %s/TFBS.SNP.xls\n'%(out_dir,SNP_file,out_dir))\\
run_TFBS_script_step2.append(run_SNP_anno_script_sh)\\
'''
#circ.launch_jobs(mem='3G', p=2, fns=run_TFBS_script_step2,error_dir=log_out_dir)
个性化条目/差异基因转录因子结合位点预测.txt · 最后更改: 2023/03/08 07:53 由 fengjie