数据库官网

https://tcdb.org/download.php

脚本

/TJPROJ6/RNA_SH/script_dir/annotation/TCDB/TCDB_blast.py

如果要增加二级注释，请使用下面的脚本（第一个参数为 *_tcdb.xls 结果文件）：

import sys,re
import requests
# Python 2 only: make UTF-8 the implicit str <-> unicode codec so that
# non-ASCII names returned by tcdb.org can be concatenated and printed.
# Python 3 has no builtin reload() and already defaults to UTF-8, so the
# hack is skipped there instead of crashing with NameError.
if sys.version_info[0] < 3:
	reload(sys)  # brings back sys.setdefaultencoding, which is removed at startup
	sys.setdefaultencoding('utf8')

# Single HTTP session shared by every get_subfamily() call in this script.
session = requests.Session()

# Disabled alternative, kept for reference: build an ID -> name table from a
# local "annot.xls" file whose lines look like "<id> - <name>".
#annot_open = open("annot.xls").readlines()
#annot = {}
#for each in annot_open:
#	each_lines = each.strip().split("-")
#	annot[each_lines[0].strip()] = each_lines[1].strip()
def get_subfamily(ids, timeout=30):
	"""Fetch the "name" field that tcdb.org reports for a TC identifier.

	Args:
		ids: Identifier string appended verbatim to the ``id=`` query of
			getinfo.php (the caller in this file passes values like "1.A.1").
		timeout: Seconds to wait for the server before giving up. Optional;
			without a timeout a stalled connection would block forever.

	Returns:
		The value stored under "name" in the JSON reply, or None when the
		request fails, the HTTP status is not 200, or the reply is not JSON
		or has no "name" entry. Failures are reported on stderr so they do
		not end up inside the tab-separated table written to stdout.
	"""
	info_url = "https://tcdb.org/getinfo.php?id=" + ids
	headers = {
		"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
	}
	# The request itself must be inside the try block: connection errors and
	# timeouts are raised by post(), not by the code that reads the response.
	try:
		resp = session.post(info_url, headers=headers, timeout=timeout)
	except requests.RequestException as e:
		sys.stderr.write("Error: TCDB request failed for %s: %s\n" % (ids, e))
		return None
	if resp.status_code != 200:
		return None
	try:
		return resp.json()["name"]
	except (ValueError, KeyError, TypeError) as e:
		# ValueError: body is not JSON; KeyError/TypeError: no "name" entry.
		sys.stderr.write("Error: unexpected TCDB reply for %s: %r\n" % (ids, e))
		return None


# First three dot-separated fields of a TC number, e.g. "1.A.1" in "1.A.1.1.1".
# The dots are escaped so that they only match a literal "." and the character
# class no longer contains a stray ",".
TC_SUBFAMILY_RE = re.compile(r"([0-9]\.[A-Za-z]\.[0-9]+)")

# File-name suffix removed to obtain the sample name.
TCDB_SUFFIX = "_tcdb.xls"

# Printed in the last column when tcdb.org gave no name for an entry.
UNKNOWN_NAME = "NA"


def summarize_hits(lines):
	"""Count hits per TC prefix from tab-separated result rows.

	Args:
		lines: Iterable of data rows (header already removed). Column 2 must
			look like "...|<cluster>|<TC number>"; only its last two
			"|"-separated fields are used.

	Returns:
		Dict mapping each TC prefix (e.g. "1.A.1") to
		{"cluster_number": set of distinct <cluster> values,
		 "gene_number": number of rows with that prefix}.
		Rows that are too short or carry no TC number are skipped and
		reported on stderr (blank rows are skipped silently).
	"""
	result = {}
	for each in lines:
		each_lines = each.strip().split("\t")
		if len(each_lines) < 2:
			if each.strip():
				sys.stderr.write("Skipping row with fewer than 2 columns: " + each.strip() + "\n")
			continue
		subject = each_lines[1].split("|")
		found = TC_SUBFAMILY_RE.search(subject[-1])
		if len(subject) < 2 or found is None:
			# Previously this path raised NameError (undefined `each_`) and,
			# had it not, would have counted the row under a stale prefix.
			sys.stderr.write("Skipping row without a TC number: " + each.strip() + "\n")
			continue
		entry = result.setdefault(found.group(1), {"cluster_number": set(), "gene_number": 0})
		entry["cluster_number"].add(subject[-2])
		entry["gene_number"] += 1
	return result


def sample_name_from_path(path):
	"""Derive the sample name from the input path.

	The "_tcdb.xls" suffix is removed as a suffix; str.strip() would instead
	strip any of those characters from both ends ("sample" -> "ample").
	"""
	if path.endswith(TCDB_SUFFIX):
		path = path[:-len(TCDB_SUFFIX)]
	# NOTE(review): this keeps the original behaviour of taking the FIRST path
	# component, so "./x_tcdb.xls" yields "." -- confirm whether the basename
	# was intended.
	return path.split("/")[0]


def main(argv):
	"""Print one row per TC prefix: sample, prefix, #clusters, #genes, name."""
	if len(argv) < 2:
		sys.exit("Usage: %s <sample>_tcdb.xls" % argv[0])
	with open(argv[1]) as handle:
		xls_open = handle.readlines()
	result = summarize_hits(xls_open[1:])  # first line is the header
	name = sample_name_from_path(argv[1])
	for each_key, each_value in result.items():
		subfamily = get_subfamily(each_key)
		if subfamily is None:
			# Lookup failed; keep the row instead of crashing on str + None.
			subfamily = UNKNOWN_NAME
		print("\t".join([
			name,
			each_key,
			str(len(each_value["cluster_number"])),
			str(each_value["gene_number"]),
			subfamily,
		]))


if __name__ == "__main__":
	main(sys.argv)