数据库官网
https://tcdb.org/download.php
脚本
/TJPROJ6/RNA_SH/script_dir/annotation/TCDB/TCDB_blast.py
如果要增加二级注释，请使用下面的脚本（用法：python 脚本名 <样本名>_tcdb.xls，结果输出到标准输出）：
"""Summarise TCDB BLAST hits per TC class and attach the TCDB name.

Usage: python <this script> <sample>_tcdb.xls

The input is a tab-separated table with one header line.  The second
column holds a "|"-separated hit identifier; its last field carries the
TC number and the field before it is used as the cluster identifier.
For every TC class (the first three components of the TC number, e.g.
"1.A.1") one line is printed to stdout:

    <sample> <TC class> <distinct clusters> <hit rows> <TCDB name>
"""
import re
import sys

# Python 2 only: the original script forced the default encoding to UTF-8.
# Neither call exists on Python 3, where text is Unicode already.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

TCDB_INFO_URL = "https://tcdb.org/getinfo.php?id="
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
)
# Seconds to wait for TCDB before giving up on a single lookup.
REQUEST_TIMEOUT = 30
# First three components of a TC number, e.g. "1.A.1" out of "1.A.1.1.1".
# The dots are escaped so that only literal dots separate the components.
TC_CLASS_PATTERN = re.compile(r"([0-9]\.[A-Za-z]\.[0-9]+)")
TCDB_SUFFIX = "_tcdb.xls"
# Written in the name column when TCDB returns no name for a class.
MISSING_NAME = "NA"

# Shared HTTP session, created on the first lookup so that the parsing
# helpers can be imported and used without the `requests` package.
session = None

# NOTE: an earlier variant read the names from a local "annot.xls" file
# ("<id> - <name>" per line) instead of querying TCDB.


def get_subfamily(ids):
    """Return the TCDB name for the TC identifier *ids*, or None.

    Posts to TCDB's getinfo.php endpoint and reads the "name" field of
    the JSON reply.  None is returned when the request fails, the status
    is not 200, or the reply is not a JSON object with a "name" entry.
    """
    import requests  # lazy: only the network lookup needs it

    global session
    if session is None:
        session = requests.session()

    headers = {"User-Agent": USER_AGENT}
    try:
        # The request itself is inside the try block; connection errors
        # are raised here, not while reading the response.
        resp = session.post(
            TCDB_INFO_URL + ids, headers=headers, timeout=REQUEST_TIMEOUT
        )
        if resp.status_code != 200:
            return None
        payload = resp.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a non-JSON body on older requests versions.
        # Diagnostics go to stderr so the tabular stdout stays clean.
        sys.stderr.write("Error looking up %s: %r\n" % (ids, err.args))
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get("name")


def parse_hit(line):
    """Return ``(tc_class, cluster)`` for one data row, or None.

    None is returned when the row has fewer than two columns, the hit
    identifier has fewer than two "|"-separated fields, or its last
    field contains no TC class.
    """
    columns = line.strip().split("\t")
    if len(columns) < 2:
        return None
    fields = columns[1].split("|")
    if len(fields) < 2:
        return None
    match = TC_CLASS_PATTERN.search(fields[-1])
    if match is None:
        return None
    return match.group(1), fields[-2]


def summarize_hits(lines):
    """Aggregate data rows into per-TC-class statistics.

    Returns a dict mapping each TC class to
    ``{"cluster_number": [distinct clusters in first-seen order],
       "gene_number": number of rows}``.
    Blank rows are ignored; rows that cannot be parsed are reported on
    stderr and skipped instead of aborting the run.
    """
    result = {}
    for line in lines:
        if not line.strip():
            continue
        hit = parse_hit(line)
        if hit is None:
            sys.stderr.write(
                "Skipping line without a TC class: %s\n" % line.rstrip("\n")
            )
            continue
        tc_class, cluster = hit
        stats = result.setdefault(
            tc_class, {"cluster_number": [], "gene_number": 0}
        )
        if cluster not in stats["cluster_number"]:
            stats["cluster_number"].append(cluster)
        stats["gene_number"] += 1
    return result


def sample_name(path):
    """Derive the sample label from the input *path*.

    Removes a trailing "_tcdb.xls" (as a suffix; ``str.strip`` would
    remove any of those characters from both ends) and returns the part
    before the first "/".
    """
    if path.endswith(TCDB_SUFFIX):
        path = path[:-len(TCDB_SUFFIX)]
    # NOTE(review): the first path component is kept as in the original
    # script; for an absolute path this is "" - confirm whether the file's
    # base name was intended instead.
    return path.split("/")[0]


def main(argv=None):
    """Read the table named in ``argv[1]`` and print the per-class summary."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("Usage: %s <sample>_tcdb.xls\n" % argv[0])
        return 1

    path = argv[1]
    with open(path) as handle:
        next(handle, None)  # skip the header line
        summary = summarize_hits(handle)

    label = sample_name(path)
    for tc_class, stats in summary.items():
        name = get_subfamily(tc_class)
        print("\t".join([
            label,
            tc_class,
            str(len(stats["cluster_number"])),
            str(stats["gene_number"]),
            name if name else MISSING_NAME,
        ]))
    return 0


if __name__ == "__main__":
    sys.exit(main())