======简介====== 单亲二体(uniparental disomy, UPD),指来自父母一方的染色体区域/片段被另一方的同源部分取代,或一个个体的两条同源染色体都来自同一亲体。个体的所有染色体都来源于单亲,在胚胎发育时夭折。如来源于父亲的单亲二倍体发育成葡萄胎,易恶性转化 成绒毛膜肿瘤;来源于母亲的单亲二倍体则形成卵巢畸胎瘤。 ======功能====== 该流程用于计算染色体中是否发生单亲二倍体 ======数据准备====== 1 sample info 文件 和vcf.gz文件 ======数据分析====== 脚本示例 : /TJPROJ2/RESEQ/share/software/vcftools_v0.1.14_step/bin/vcftools \ --gzvcf goose-dp5-miss0.2-maf0.05.vcf.gz \ --TajimaD 20000 \ --step 10000 \ --out goose #!/usr/bin/python #coding: utf-8 import os import sys import re import argparse from argparse import RawTextHelpFormatter parser = argparse.ArgumentParser(description = 'Function: Dominant, Recessive and Compound heterozygous model analysis.Notice that all gene name in AAChange column must in GeneName column\nContact: yuhuan@novogene.com', formatter_class = RawTextHelpFormatter) #parser.add_argument('-M', '--model', metavar = 'String', required = True, help = 'Dominant(D)|Recessive(R)|Compound heterozygous(C).') parser.add_argument('-S', '--samp_info', metavar = 'File', required = True, help = '#FamilyID, SampleID and Normal/Patient must be inclued, samples in the same family must have the same familyID.') #parser.add_argument('-Fid', '--FamilyID',metavar = 'String', required = True, help = 'in sampleinfo') parser.add_argument('-I', '--in', metavar = 'File', required = True, help = 'The merged file use to do model analysis with .xls(.gz) format.') parser.add_argument('-T', '--type',metavar = 'String', required = True, help = 'Mutation type, snp,indel,or both',choices=['snp','indel','snp.indel']) parser.add_argument('-AC', '--allcase', metavar = 'String', required = False, help = 'Mutations are same in all case, use for sporadic sample, Y for all case must have same mutaion, N for case can be ref. choices=[Y, N], defalut=Y (yes)', choices=['Y', 'N'], default='Y') parser.add_argument('-O', '--out', metavar = 'file', default = './', help = 'The output files prefix') argv = vars(parser.parse_args()) #model = argv['model'].strip() samp_info = argv['samp_info'].strip() infile = argv['in'].strip() type = argv['type'].strip() out = argv['out'].strip() #Fid = argv['FamilyID'].strip() allcase = argv['allcase'].strip() #define the open file handle def safe_open(file_name,mode): try: if not file_name.endswith('.gz'): return open(file_name,mode) else: import gzip return gzip.open(file_name,mode) except IOError: print file_name + ' do not exist!' def get_het(sample): if re.match(r'\d+',sample): sample_format = re.search('(\d+)[\/|\|](\d+)',sample) if sample_format.group(1) == sample_format.group(2) and sample_format.group(1) != '0': return 'hom' elif sample_format.group(1) == sample_format.group(2) and sample_format.group(1) == '0': return 'ref' elif sample_format.group(1) != sample_format.group(2) and \ (sample_format.group(1) == '0' or sample_format.group(2) == '0'): return 'het' else: return 'chet' ##compound heterozygous else: return 'nocall' def getpos2(name,title,name2=''): ntitle = [i.lower() for i in title] if name.lower() in ntitle: pos = ntitle.index(name.lower()) elif name2 != '' and name2.lower() in ntitle: pos = ntitle.index(name2.lower()) else: if name2 != '': exit('Warning: %s and %s not in title.' %(name,name2)) else: exit('Warning: %s not in title.' %name) return pos def getRatio(formatinfo,dpinfo): dpdic = dict(zip(formatinfo,dpinfo)) if ('DV' in dpdic) and re.match(r'\d+',dpdic['DV']) and re.match(r'\d+',dpdic['DP']) and int(dpdic['DP']) > 0: dpratio = float(dpdic['DV'])/int(dpdic['DP']) elif ('AD' in dpdic) and re.match(r'\d+',dpdic['AD']) and re.match(r'\d+',dpdic['DP']) and int(dpdic['DP']) > 0: Adp = 0 for adp in range(1,len(dpdic['AD'].split(','))): if dpdic['AD'].split(',')[adp]!=".": Adp += float(dpdic['AD'].split(',')[adp]) dpratio = float(Adp)/int(dpdic['DP']) else: dpratio = 1 print "Please check the FORMAT %s %s cloum, if AD/DV and DP in this colum or if DP value larger than 0, dpratio set as 1." %(formatinfo, dpinfo) return dpratio,dpdic def get_upd(gt_proband,gt_father,gt_mother):#gt_proband,gt_father.gt_mother tag = []#UN/BI/UA/UI pa/ma if get_het(gt_proband) in ['ref','hom']: if get_het(gt_father) in ['ref','hom'] and get_het(gt_mother) in ['ref','hom'] and gt_father != gt_mother: tag.append('UA') if gt_proband == gt_father: tag.append('pa') elif gt_proband == gt_mother: tag.append('ma') elif (get_het(gt_father) in ['ref','hom']) and (gt_proband != gt_father) and (get_het(gt_mother) == 'het'): tag.append('UI') tag.append('ma') elif (get_het(gt_mother) in ['ref','hom']) and (gt_proband != gt_mother) and (get_het(gt_father) == 'het'): tag.append('UI') tag.append('pa') else: tag.append('UN') elif get_het(gt_proband) == 'het': if get_het(gt_father) in ['ref','hom'] and get_het(gt_mother) in ['ref','hom'] and gt_father != gt_mother: tag.append('BI') else: tag.append('UN') elif (get_het(gt_proband) in ['ref','nocall']) and (get_het(gt_father) in ['ref','nocall']) and (get_het(gt_mother) in ['ref','nocall']): tag.append('noVariant') else: tag.append('UN') #print '_'.join(tag) return '_'.join(tag) def main(): with open (samp_info,'r') as f1, safe_open(infile,'r') as f2, open(out+'.xls','w') as f3, open(out+".stat.xls",'w') as f4: famD = {} statD = {}#statistic for the UN ,UA, UI, BI, percent #get the sample's relation print ">>>>>>>>>>Start to get the information ...." for i in f1: ii = i.strip() if ii.startswith("#") and 'familyid' not in ii.lower() : continue elif ii.lower().startswith('#familyid') and ('pa' in ii.lower()) and ('ma' in ii.lower()): ii_h = ii.lower().split('\t')#the header #print ii_h else: iiline = ii.split('\t') #print iiline if len(iiline) == len(ii_h):#the line contine the proband and the pa,ma famid = iiline[ii_h.index('#familyid')] probandid = iiline[ii_h.index('sampleid')] paid = iiline[ii_h.index('pa')] maid = iiline[ii_h.index('ma')] if not famD.get(famid,False): famD.setdefault(famid,[]).append(probandid) famD.setdefault(famid,[]).append(paid) famD.setdefault(famid,[]).append(maid) #statD[famid] = {'total':0,'UN':0,'UA':0,'UI':0,'BI':0} statD[famid] = {} for chr in [i for i in range(1,22+1)]: statD[famid][str(chr)] = {'total':0,'UN':0,'UA':0,'UI':0,'BI':0} else: print "The family has not the only one proband!Please check! "+ famid + "\t" + probandid else: print "The proband %s has not the information of his father and mother. " %iiline[ii_h.index('sampleid')] print famD print statD print ">>>>>>>>>>>>>>>Start judge:........." #judge the variants for infile for line in f2: iline = line.strip().split('\t') #print iline[0] if iline[0].startswith("##"): continue elif iline[0] in ("Priority","#CHROM"): headerlist = iline print headerlist pformat = line.strip().lower().split("\t").index('format') pChr = getpos2('#chrom',line.strip().lower().split("\t"),'chrom') # print pChr outheader = ['tag','UPDfam'] + headerlist f3.write("\t".join(outheader)+"\n") for famid in famD: for iisam in famD[famid][0:4]: # famD.setdefault(famid,[]).append(headerlist.index(iisam))##add the index for the sample to get the genotype famD[famid].append(headerlist.index(iisam)) print famD else: gtIndex = iline[pformat].split(":").index("GT") tag = [] #UN,UA,UI,BI updfam = [] #only the UA and UI familyID chrom = iline[pChr] #print chrom if chrom not in ('X','Y'): for famid in famD: gt_proband = iline[famD[famid][3]].split(":")[gtIndex] gt_father = iline[famD[famid][4]].split(":")[gtIndex] gt_mother = iline[famD[famid][5]].split(":")[gtIndex] upd = get_upd(gt_proband,gt_father,gt_mother)#UA_pa/ma,UI_pa/ma,BI,UN,noVariant # print upd+": "+gt_proband+" "+gt_father+" "+gt_mother tag.append(upd+"_"+famid) statD[famid][chrom]['total'] += 1 statD[famid][chrom][upd.split("_")[0]] += 1 if upd not in ("UN","noVariant","BI"): updfam.append(famid) if updfam != []: outlist = [';'.join(tag),';'.join(updfam)] + iline else: outlist = [';'.join(tag),'--'] + iline f3.write("\t".join(outlist)+"\n") print statD f4.write("\t".join(["FamID","CHR","Total","UI","UIperc","UA","UAperc","BI","BIperc","UN","UNperc"])+"\n") for famid in famD: for ichr in [str(i) for i in range(1,22+1)]: if statD[famid][ichr]['total'] != 0: uiPerc = statD[famid][ichr]['UI']/float(statD[famid][ichr]['total']) uaPerc = statD[famid][ichr]['UA']/float(statD[famid][ichr]['total']) biPerc = statD[famid][ichr]['BI']/float(statD[famid][ichr]['total']) unPerc = statD[famid][ichr]['UN']/float(statD[famid][ichr]['total']) statout = '%s\t%s\t%d\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f' %(famid,ichr, statD[famid][ichr]['total'], statD[famid][ichr]['UI'], uiPerc, statD[famid][ichr]['UA'], uaPerc,statD[famid][ichr]['BI'],biPerc,statD[famid][ichr]['UN'],unPerc) # f4.write("\t".join(statout)+"\n") else: statout = '%s\t%s\t0\t0\t-\t0\t-\t0\t-' %(famid,ichr) f4.write(statout+"\n") if __name__ == '__main__': main() python upd.v1.py -S sample_info_f7 -I f7.snp.indel.vcf.gz -T snp.indel -O f7.snp.indel.upd 示例路径 /TJPROJ12/AFS_RESEQ/Proj/hongxiang/WES.guangfuer/AFS/UPD/updbak 步骤: 1、bam截取,截取chr21 目录bam_cut: 投递:run.sh (内部生成do_IGV.sh、IGV_21_xx_xxxx.sh、get.21_xx_xxxx.sh,并运行相应shell),本地IGV截图。 2、根据位点统计 目录:site_genetic_sources W3889样本所在家系处理方法:step1_extract_sample_and_site.sh,结果见F10_snp.from_*.xls 对照家系处理方法:step2_control_extract_sample_and_site.sh,结果见F4control_snp.from_*.xls 3、UPD分析 目录:UPD 投递 run.sh生成 step.control.sh step.target.sh后,依次投递此两个shell 结果见:*upd.stat.xls 分析情况见:chr21父源母源情况查看结果.docx