# ===== biomart: adding homologous genes =====
#
# Homologous genes are usually found with BLAST, but they can also be looked
# up in the Ensembl BioMart database. This script maps zebra finch
# (tguttata) gene symbols to their mouse homologs with biomaRt::getLDS().

library(biomaRt)
library(dplyr)

# Pin one Ensembl archive release so every query below hits the same data.
ensembl.host <- "https://dec2021.archive.ensembl.org/"

# Connect to the Ensembl BioMart database.
ensembl <- useEnsembl("ensembl", host = ensembl.host)

species.1 <- "mouse"
species.2 <- "tguttata"

# Fuzzy-match the species name (the Latin name is the usual choice) against
# every column of the dataset listing to find the dataset's index name.
# The variables are passed unquoted so that their *values* are searched for,
# and the match is case-insensitive so "mouse" also finds "Mouse genes".
datasets <- listDatasets(ensembl)
datasets %>%
  filter_all(any_vars(grepl(species.1, ., ignore.case = TRUE)))
datasets %>%
  filter_all(any_vars(grepl(species.2, ., ignore.case = TRUE)))

# Taking mouse as the example: matching "Mouse" shows that the GRCm39 mouse
# genome corresponds to the dataset index name "mmusculus_gene_ensembl".
# Recorded output for the pattern "Mouse" (a case-insensitive match may list
# additional rows):
#                  dataset                  description  version
# 1  mmurinus_gene_ensembl Mouse Lemur genes (Mmur_3.0) Mmur_3.0
# 2 mmusculus_gene_ensembl         Mouse genes (GRCm39)   GRCm39

# Load the gene datasets of the two species being compared.
mouse.geneset <- useMart(
  "ensembl",
  dataset = "mmusculus_gene_ensembl",
  host = ensembl.host
)
tguttata.geneset <- useMart(
  "ensembl",
  dataset = "tguttata_gene_ensembl",
  host = ensembl.host
)

# Find the attribute name that holds the gene symbol, again by fuzzy matching.
listAttributes(mouse.geneset) %>%
  filter_all(any_vars(grepl("symbol", ., ignore.case = TRUE)))
# Recorded output:
#                name                 description         page
# 1       hgnc_symbol                 HGNC symbol feature_page
# 2        mgi_symbol                  MGI symbol feature_page
# 3 uniprot_gn_symbol UniProtKB Gene Name symbol feature_page

# Mouse gene symbols are normally "mgi_symbol" (choose according to the naming
# scheme of the gene set you are querying); mouse has its own gene
# nomenclature system. Species without their own nomenclature system usually
# identify genes with "hgnc_symbol" (the human gene nomenclature).

# Cross-species homolog lookup: for each zebra finch gene, fetch the
# corresponding mouse gene.
tguttata.query_sets <- c("COPG1", "VLDLR", "TBC1D22A")
tguttata2mouse <- getLDS(
  values = tguttata.query_sets,
  attributes = c("hgnc_symbol"),
  filters = "hgnc_symbol",
  mart = tguttata.geneset,
  attributesL = c("mgi_symbol"),
  martL = mouse.geneset,
  uniqueRows = TRUE
)

tguttata2mouse
# Recorded output:
#   HGNC.symbol MGI.symbol
# 1       COPG1      Copg1
# 2    TBC1D22A   Tbc1d22a
# 3       VLDLR      Vldlr