目录

简介

单亲二体(uniparental disomy, UPD),指来自父母一方的染色体区域/片段被另一方的同源部分取代,或一个个体的两条同源染色体都来自同一亲体。个体的所有染色体都来源于单亲,在胚胎发育时夭折。如来源于父亲的单亲二倍体发育成葡萄胎,易恶性转化 成绒毛膜肿瘤;来源于母亲的单亲二倍体则形成卵巢畸胎瘤。

功能

该流程用于计算染色体中是否发生单亲二倍体

数据准备

1 sample info 文件 和vcf.gz文件

数据分析

脚本示例 : /TJPROJ2/RESEQ/share/software/vcftools_v0.1.14_step/bin/vcftools \

  1. -gzvcf goose-dp5-miss0.2-maf0.05.vcf.gz \
  2. -TajimaD 20000 \
  3. -step 10000 \
  4. -out goose
#!/usr/bin/python
#coding: utf-8
import os
import sys
import re
import argparse

from argparse import RawTextHelpFormatter
parser = argparse.ArgumentParser(description = 'Function: Dominant, Recessive and Compound heterozygous model analysis.Notice that all gene name in AAChange column must in GeneName column\nContact: yuhuan@novogene.com', formatter_class = RawTextHelpFormatter)
#parser.add_argument('-M', '--model', metavar = 'String', required = True, help = 'Dominant(D)|Recessive(R)|Compound heterozygous(C).')
parser.add_argument('-S', '--samp_info', metavar = 'File', required = True, help = '#FamilyID, SampleID and Normal/Patient must be inclued, samples in the same family must have the same familyID.')
#parser.add_argument('-Fid', '--FamilyID',metavar = 'String', required = True, help = 'in sampleinfo')
parser.add_argument('-I', '--in', metavar = 'File', required = True, help = 'The merged file use to do model analysis with .xls(.gz) format.')
parser.add_argument('-T', '--type',metavar = 'String', required = True, help = 'Mutation type, snp,indel,or both',choices=['snp','indel','snp.indel'])
parser.add_argument('-AC', '--allcase', metavar = 'String', required = False, help = 'Mutations are same in all case, use for sporadic sample, Y for all case must have same mutaion, N for case can be ref. choices=[Y, N], defalut=Y (yes)', choices=['Y', 'N'], default='Y')
parser.add_argument('-O', '--out', metavar = 'file', default  = './', help = 'The output files prefix')
argv = vars(parser.parse_args())

#model     = argv['model'].strip()
samp_info = argv['samp_info'].strip()
infile    = argv['in'].strip()
type      = argv['type'].strip()
out       = argv['out'].strip()
#Fid       = argv['FamilyID'].strip()
allcase   = argv['allcase'].strip()
#define the open file handle
def safe_open(file_name,mode):
        try:
                if not file_name.endswith('.gz'):
                        return open(file_name,mode)
                else:
                        import gzip
                        return gzip.open(file_name,mode)
        except IOError:
                print file_name + ' do not exist!'

def get_het(sample):
    if re.match(r'\d+',sample):
        sample_format = re.search('(\d+)[\/|\|](\d+)',sample)
        if sample_format.group(1) == sample_format.group(2) and sample_format.group(1) != '0':
            return 'hom'
        elif sample_format.group(1) == sample_format.group(2) and sample_format.group(1) == '0':
            return 'ref'
        elif sample_format.group(1) != sample_format.group(2) and \
            (sample_format.group(1) == '0' or sample_format.group(2) == '0'):
            return 'het'
        else:
            return 'chet'  ##compound heterozygous
    else:
        return 'nocall'  

def getpos2(name,title,name2=''):
    ntitle = [i.lower() for i in title]
    if name.lower() in ntitle:
        pos = ntitle.index(name.lower())
    elif name2 != '' and name2.lower() in ntitle:
        pos = ntitle.index(name2.lower())
    else:
        if name2 != '':
            exit('Warning: %s and %s not in title.' %(name,name2))
        else:
            exit('Warning: %s not in title.' %name)
    return pos

def getRatio(formatinfo,dpinfo):
    dpdic = dict(zip(formatinfo,dpinfo))
    if ('DV' in dpdic) and re.match(r'\d+',dpdic['DV']) and re.match(r'\d+',dpdic['DP']) and int(dpdic['DP']) > 0:
        dpratio = float(dpdic['DV'])/int(dpdic['DP'])
    elif ('AD' in dpdic) and  re.match(r'\d+',dpdic['AD']) and re.match(r'\d+',dpdic['DP']) and int(dpdic['DP']) > 0:
        Adp = 0
        for adp in range(1,len(dpdic['AD'].split(','))):
            if dpdic['AD'].split(',')[adp]!=".":
                Adp += float(dpdic['AD'].split(',')[adp])
        dpratio = float(Adp)/int(dpdic['DP'])
    else:
        dpratio = 1
        print "Please check the FORMAT %s %s cloum, if AD/DV and DP in this colum or if DP value larger than 0, dpratio set as 1." %(formatinfo, dpinfo)
    return dpratio,dpdic


def get_upd(gt_proband,gt_father,gt_mother):#gt_proband,gt_father.gt_mother
    tag = []#UN/BI/UA/UI pa/ma
    if get_het(gt_proband) in ['ref','hom']:
        if get_het(gt_father) in ['ref','hom'] and get_het(gt_mother) in ['ref','hom'] and gt_father != gt_mother:
            tag.append('UA')
            if gt_proband == gt_father:
                tag.append('pa')
            elif gt_proband == gt_mother:
                tag.append('ma')
        elif (get_het(gt_father) in ['ref','hom']) and (gt_proband != gt_father) and (get_het(gt_mother) == 'het'):
            tag.append('UI')
            tag.append('ma')
        elif (get_het(gt_mother) in ['ref','hom']) and (gt_proband != gt_mother) and (get_het(gt_father) == 'het'):
            tag.append('UI')
            tag.append('pa')
        else:
            tag.append('UN')
    elif get_het(gt_proband) == 'het':
        if get_het(gt_father) in ['ref','hom'] and get_het(gt_mother) in ['ref','hom'] and gt_father != gt_mother:
            tag.append('BI')
        else:
            tag.append('UN')
    elif (get_het(gt_proband) in ['ref','nocall']) and (get_het(gt_father) in ['ref','nocall']) and (get_het(gt_mother) in ['ref','nocall']):
        tag.append('noVariant')
    else:
        tag.append('UN')
    #print '_'.join(tag)
    return '_'.join(tag)


def main():
    with open (samp_info,'r') as f1, safe_open(infile,'r') as f2, open(out+'.xls','w') as f3, open(out+".stat.xls",'w') as f4:
        famD = {}
        statD = {}#statistic for the UN ,UA, UI, BI, percent
        #get the sample's relation
        print ">>>>>>>>>>Start to get the information ...."
        for i in f1:
            ii = i.strip()
            if ii.startswith("#") and 'familyid' not in ii.lower() :
                continue
            elif ii.lower().startswith('#familyid') and ('pa' in ii.lower()) and ('ma' in ii.lower()):
                ii_h = ii.lower().split('\t')#the header
                #print ii_h
            else:
                iiline = ii.split('\t')
                #print iiline
                if len(iiline) == len(ii_h):#the line contine the proband and the pa,ma
                    famid = iiline[ii_h.index('#familyid')]
                    probandid = iiline[ii_h.index('sampleid')]
                    paid = iiline[ii_h.index('pa')]
                    maid = iiline[ii_h.index('ma')]
                    if not famD.get(famid,False):
                        famD.setdefault(famid,[]).append(probandid)
                        famD.setdefault(famid,[]).append(paid)
                        famD.setdefault(famid,[]).append(maid)
                        #statD[famid] = {'total':0,'UN':0,'UA':0,'UI':0,'BI':0}
                        statD[famid] = {}
                        for chr in [i for i in range(1,22+1)]:
                            statD[famid][str(chr)] = {'total':0,'UN':0,'UA':0,'UI':0,'BI':0}
                    else:
                        print "The family has not the only one proband!Please check! "+ famid + "\t" + probandid
                else:
                    print "The proband %s has not the information of his father and mother. " %iiline[ii_h.index('sampleid')]
        print famD
        print statD 
        print ">>>>>>>>>>>>>>>Start judge:........."
        #judge the variants for infile
        for line in f2:
            iline = line.strip().split('\t')
            #print iline[0]
            if iline[0].startswith("##"):
                continue
            elif iline[0] in ("Priority","#CHROM"):
                headerlist = iline
                print headerlist
                pformat = line.strip().lower().split("\t").index('format')
                pChr = getpos2('#chrom',line.strip().lower().split("\t"),'chrom')
        #        print pChr
                outheader = ['tag','UPDfam'] + headerlist
                f3.write("\t".join(outheader)+"\n")
                for famid in famD:
                    for iisam in famD[famid][0:4]:
#                        famD.setdefault(famid,[]).append(headerlist.index(iisam))##add the index for the sample to get the genotype
                        famD[famid].append(headerlist.index(iisam))
                print famD
                
            else:
                gtIndex = iline[pformat].split(":").index("GT")
                tag = [] #UN,UA,UI,BI
                updfam = [] #only the UA and UI familyID
                chrom = iline[pChr]
                #print chrom
                if chrom not in ('X','Y'):
                    for famid in famD:
                        gt_proband = iline[famD[famid][3]].split(":")[gtIndex]
                        gt_father = iline[famD[famid][4]].split(":")[gtIndex]
                        gt_mother = iline[famD[famid][5]].split(":")[gtIndex]
                        upd = get_upd(gt_proband,gt_father,gt_mother)#UA_pa/ma,UI_pa/ma,BI,UN,noVariant
                 #   print upd+": "+gt_proband+" "+gt_father+" "+gt_mother
                        tag.append(upd+"_"+famid)
                        statD[famid][chrom]['total'] += 1
                        statD[famid][chrom][upd.split("_")[0]] += 1
                        if upd not in ("UN","noVariant","BI"):
                            updfam.append(famid)
                    if updfam != []:
                        outlist = [';'.join(tag),';'.join(updfam)] + iline
                    else:
                        outlist = [';'.join(tag),'--'] + iline
                    f3.write("\t".join(outlist)+"\n")
        print statD
        f4.write("\t".join(["FamID","CHR","Total","UI","UIperc","UA","UAperc","BI","BIperc","UN","UNperc"])+"\n")
        for famid in famD:
            for ichr in [str(i) for i in range(1,22+1)]:
                if statD[famid][ichr]['total'] != 0:
                    uiPerc = statD[famid][ichr]['UI']/float(statD[famid][ichr]['total'])
                    uaPerc = statD[famid][ichr]['UA']/float(statD[famid][ichr]['total'])
                    biPerc = statD[famid][ichr]['BI']/float(statD[famid][ichr]['total'])
                    unPerc = statD[famid][ichr]['UN']/float(statD[famid][ichr]['total'])
                    statout = '%s\t%s\t%d\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f' %(famid,ichr, statD[famid][ichr]['total'], statD[famid][ichr]['UI'], uiPerc, statD[famid][ichr]['UA'], uaPerc,statD[famid][ichr]['BI'],biPerc,statD[famid][ichr]['UN'],unPerc)
#            f4.write("\t".join(statout)+"\n")
                else:
                    statout = '%s\t%s\t0\t0\t-\t0\t-\t0\t-' %(famid,ichr)
                f4.write(statout+"\n")
if __name__ == '__main__':
    main()                    

python upd.v1.py -S sample_info_f7 -I f7.snp.indel.vcf.gz -T snp.indel -O f7.snp.indel.upd 示例路径

/TJPROJ12/AFS_RESEQ/Proj/hongxiang/WES.guangfuer/AFS/UPD/updbak

步骤: 1、bam截取,截取chr21 目录bam_cut: 投递:run.sh (内部生成do_IGV.sh、IGV_21_xx_xxxx.sh、get.21_xx_xxxx.sh,并运行相应shell),本地IGV截图。

2、根据位点统计 目录:site_genetic_sources W3889样本所在家系处理方法:step1_extract_sample_and_site.sh,结果见F10_snp.from_*.xls 对照家系处理方法:step2_control_extract_sample_and_site.sh,结果见F4control_snp.from_*.xls

3、UPD分析 目录:UPD 投递 run.sh生成 step.control.sh step.target.sh后,依次投递此两个shell 结果见:*upd.stat.xls

分析情况见:chr21父源母源情况查看结果.docx