#### Get gene names from Ensembl IDs

In [9]:
import pickle
import biomart

In [3]:
# FROM https://autobencoder.com/2021-10-03-gene-conversion/#python

def get_ensembl_mappings():                                   
    # Set up connection to server                                               
    server = biomart.BiomartServer('http://uswest.ensembl.org/biomart')         
    mart = server.datasets['mmusculus_gene_ensembl']                            
                                                                                
    # List the types of data we want                                            
    attributes = ['ensembl_transcript_id', 'mgi_symbol', 
                  'ensembl_gene_id', 'ensembl_peptide_id']
                                                                                
    # Get the mapping between the attributes                                    
    response = mart.search({'attributes': attributes})                          
    data = response.raw.data.decode('ascii')                                    
                                                                                
    ensembl_to_genesymbol = {}                                                  
    # Store the data in a dict                                                  
    for line in data.splitlines():                                              
        line = line.split('\t')                                                 
        # The entries are in the same order as in the `attributes` variable
        transcript_id = line[0]                                                 
        gene_symbol = line[1]                                                   
        ensembl_gene = line[2]                                                  
        ensembl_peptide = line[3]                                               
                                                                                
        # Some of these keys may be an empty string. If you want, you can 
        # avoid having a '' key in your dict by ensuring the 
        # transcript/gene/peptide ids have a nonzero length before
        # adding them to the dict
        ensembl_to_genesymbol[transcript_id] = gene_symbol                      
        ensembl_to_genesymbol[ensembl_gene] = gene_symbol                       
        ensembl_to_genesymbol[ensembl_peptide] = gene_symbol                
                                                                                
    return ensembl_to_genesymbol

In [4]:
ensembl_to_genesymbol = get_ensembl_mappings()

In [8]:
len(ensembl_to_genesymbol)

265025

In [11]:
with open('ensembl_to_genesymbol.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(ensembl_to_genesymbol, f, pickle.HIGHEST_PROTOCOL)

In [12]:
with open('ensembl_to_genesymbol.pickle', 'rb') as handle:
    b = pickle.load(handle)

In [15]:
b == ensembl_to_genesymbol

True

In [16]:
b

{'ENSMUST00000082387': 'mt-Tf',
 'ENSMUSG00000064336': 'mt-Tf',
 '': 'Il36rn',
 'ENSMUST00000082388': 'mt-Rnr1',
 'ENSMUSG00000064337': 'mt-Rnr1',
 'ENSMUST00000082389': 'mt-Tv',
 'ENSMUSG00000064338': 'mt-Tv',
 'ENSMUST00000082390': 'mt-Rnr2',
 'ENSMUSG00000064339': 'mt-Rnr2',
 'ENSMUST00000082391': 'mt-Tl1',
 'ENSMUSG00000064340': 'mt-Tl1',
 'ENSMUST00000082392': 'mt-Nd1',
 'ENSMUSG00000064341': 'mt-Nd1',
 'ENSMUSP00000080991': 'mt-Nd1',
 'ENSMUST00000082393': 'mt-Ti',
 'ENSMUSG00000064342': 'mt-Ti',
 'ENSMUST00000082394': 'mt-Tq',
 'ENSMUSG00000064343': 'mt-Tq',
 'ENSMUST00000082395': 'mt-Tm',
 'ENSMUSG00000064344': 'mt-Tm',
 'ENSMUST00000082396': 'mt-Nd2',
 'ENSMUSG00000064345': 'mt-Nd2',
 'ENSMUSP00000080992': 'mt-Nd2',
 'ENSMUST00000082397': 'mt-Tw',
 'ENSMUSG00000064346': 'mt-Tw',
 'ENSMUST00000082398': 'mt-Ta',
 'ENSMUSG00000064347': 'mt-Ta',
 'ENSMUST00000082399': 'mt-Tn',
 'ENSMUSG00000064348': 'mt-Tn',
 'ENSMUST00000082400': 'mt-Tc',
 'ENSMUSG00000064349': 'mt-Tc',
 'ENSMUST