biomart添加同源基因

同源基因的查找通常通过blast实现,我们也可以借助biomart数据库来完成,具体实现如下:

library(biomaRt)
library(dplyr)

# Connect to the Ensembl BioMart service on the pinned archive host.
ensembl <- useEnsembl(
  biomart = "ensembl",
  host = "https://dec2021.archive.ensembl.org/"
)

# Search patterns used to locate each species' dataset. grepl() treats them as
# case-sensitive regular expressions, so "Mouse" matches the capitalised
# common name in the description column, while "tguttata" matches the
# dataset name itself.
species.1 <- "Mouse"
species.2 <- "tguttata"

# Fetch the dataset table once and reuse it for both searches.
datasets <- listDatasets(ensembl)

# Fuzzy-match the pattern against every column to find the dataset index name.
# The pattern must be passed as the variable, not as the quoted string
# "species.1", otherwise the literal text "species.1" is searched for.
datasets %>% filter_all(any_vars(grepl(species.1, .)))
datasets %>% filter_all(any_vars(grepl(species.2, .)))
#以小鼠为例,通过模糊匹配"Mouse"可知小鼠GRCm39基因组对应的数据集索引名为:"mmusculus_gene_ensembl"
> listDatasets(ensembl) %>% filter_all( any_vars(grepl("Mouse", .)) )
                 dataset                  description  version
1  mmurinus_gene_ensembl Mouse Lemur genes (Mmur_3.0) Mmur_3.0
2 mmusculus_gene_ensembl         Mouse genes (GRCm39)   GRCm39

# Open one Mart connection per species, both on the same archive host as above.
mouse.geneset <- useMart(
  biomart = "ensembl",
  dataset = "mmusculus_gene_ensembl",
  host = "https://dec2021.archive.ensembl.org/"
)
tguttata.geneset <- useMart(
  biomart = "ensembl",
  dataset = "tguttata_gene_ensembl",
  host = "https://dec2021.archive.ensembl.org/"
)

##在数据集内查询gene symbol 的属性名,还是通过模糊匹配的方式
> listAttributes(mouse.geneset) %>% filter_all( any_vars(grepl("symbol", ., ignore.case = TRUE)) )
               name                description         page
1       hgnc_symbol                HGNC symbol feature_page
2        mgi_symbol                 MGI symbol feature_page
3 uniprot_gn_symbol UniProtKB Gene Name symbol feature_page
# 对于小鼠的gene symbol一般是mgi_symbol(看自己查询的基因集的命名方式来选择),小鼠基因有自己的一套命名系统;
# 而一些没有独自命名系统的物种一般用的是hgnc_symbol(人类基因的命名系统)来标识基因。

## Cross-species ortholog lookup: for each gene symbol queried in the
## tguttata dataset, retrieve the linked mouse gene symbol.
tguttata.query_sets <- c("COPG1", "VLDLR", "TBC1D22A")

# Store the getLDS() result itself in tguttata2mouse. Previously the variable
# was assigned the query vector and the mapping table was discarded.
tguttata2mouse <- getLDS(
  values = tguttata.query_sets,
  attributes = c("hgnc_symbol"),
  filters = "hgnc_symbol",
  mart = tguttata.geneset,
  attributesL = c("mgi_symbol"),
  martL = mouse.geneset,
  uniqueRows = TRUE
)

> tguttata2mouse
  HGNC.symbol MGI.symbol
1       COPG1      Copg1
2    TBC1D22A   Tbc1d22a
3       VLDLR      Vldlr