Source code for magine.mappings.databases.download_libraries

import logging
import math
import os
import zipfile

from defusedxml import cElementTree as ElementTree
import pandas as pd
import requests

from magine.data.storage import id_mapping_dir
from magine.logging import get_logger

logger = get_logger(__name__, log_level=logging.INFO)


def load_hgnc():
    """
    Load the HGNC table, downloading it first if no local copy exists.

    The table is read from ``hgnc.gz`` inside ``id_mapping_dir`` when that
    file is present. Otherwise it is fetched with ``download_hgnc`` and the
    presence of the written file is verified with ``_check_path``.

    Returns
    -------
    pandas.DataFrame
        Only the rows whose ``status`` column equals ``'Approved'``.
    """
    cache_file = os.path.join(id_mapping_dir, 'hgnc.gz')
    if os.path.exists(cache_file):
        table = pd.read_csv(cache_file, low_memory=False)
    else:
        table = download_hgnc()
        # the download is expected to have written the cache file
        _check_path(cache_file)
    is_approved = table.status == 'Approved'
    return table.loc[is_approved]
def load_uniprot():
    """
    Load the human UniProt id-mapping table, downloading it if needed.

    The table is read from ``human_uniprot.csv.gz`` inside
    ``id_mapping_dir`` when that file is present. Otherwise it is fetched
    with ``download_uniprot`` and the presence of the written file is
    verified with ``_check_path``.

    Returns
    -------
    pandas.DataFrame
    """
    cache_file = os.path.join(id_mapping_dir, 'human_uniprot.csv.gz')
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file, low_memory=False)

    # no local copy yet: gather the data from uniprot
    table = download_uniprot()
    _check_path(cache_file)
    return table
def load_ncbi():
    """
    Load the NCBI gene table, downloading it first if no local copy exists.

    The table is read from ``ncbi.gz`` inside ``id_mapping_dir`` when that
    file is present. Otherwise it is fetched with ``download_ncbi`` and the
    presence of the written file is verified with ``_check_path``.

    Returns
    -------
    pandas.DataFrame
    """
    cache_file = os.path.join(id_mapping_dir, 'ncbi.gz')
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file, low_memory=False)

    table = download_ncbi()
    # the download is expected to have written the cache file
    _check_path(cache_file)
    return table
def _check_path(path):
    """
    Verify that a file expected to exist after a download is really there.

    Parameters
    ----------
    path : str
        File path to check.

    Raises
    ------
    AssertionError
        If `path` does not exist.
    """
    if not os.path.exists(path):
        # Same exception type as before, now naming the missing file.
        raise AssertionError(
            "Expected file was not created: {}".format(path)
        )


def download_uniprot(species='hsa'):
    """
    Downloads the UniProt id mapping and stores it as a gzipped csv

    `<https://www.uniprot.org/>`_

    The long (uniprot, mapping_type, mapping) table is pivoted so that each
    mapping type becomes a column; when an accession has several values for
    one mapping type only the first is kept. The result is written to
    ``human_uniprot.csv.gz`` inside ``id_mapping_dir``.

    Parameters
    ----------
    species : str
        Species name. Currently only human is supported. Please let us
        know if you need other species

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If `species` is anything other than ``'hsa'``.
    """
    _url_h = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/{}'
    if species == 'hsa':
        url = _url_h.format('HUMAN_9606_idmapping.dat.gz')
    else:
        raise ValueError("Currently only implemented human uniprot. "
                         "Please contact us for additional databases")

    columns = ['uniprot', 'mapping_type', 'mapping']

    # The raw file has no header row, so column names are supplied; the
    # long table is not saved, only the pivoted one below.
    df = _download(url, columns, name='uniprot', save=False, names=columns,
                   compression='gzip')
    uniprot = pd.pivot_table(df, columns='mapping_type', index='uniprot',
                             aggfunc='first')
    # pivot_table yields two-level columns; drop the outer level
    uniprot.columns = uniprot.columns.droplevel()
    uniprot.reset_index(inplace=True)

    outfile = os.path.join(id_mapping_dir, 'human_uniprot.csv.gz')
    uniprot.to_csv(outfile, compression='gzip', header=True, index=False)
    return uniprot


def download_hgnc():
    """
    Downloads HGNC and stores it as a pandas.DataFrame

    `<http://www.genenames.org/>`_

    Returns
    -------
    pandas.DataFrame
        The selected columns; also saved as ``hgnc.gz`` in
        ``id_mapping_dir``.
    """
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/locus_groups/protein-coding_gene.txt'
    columns = ['symbol', 'uniprot_ids', 'ensembl_gene_id', 'name', 'location',
               'entrez_id', 'ucsc_id', 'vega_id', 'alias_name', 'alias_symbol',
               'status', 'gene_family', 'gene_family_id', 'ena', 'iuphar',
               'cd', 'refseq_accession', 'ccds_id', 'pubmed_id', 'mgd_id',
               'rgd_id', 'lsdb', 'bioparadigms_slc', 'enzyme_id', 'merops',
               'horde_id', 'pseudogene.org', 'cosmic', 'rna_central_ids',
               'omim_id', 'imgt', 'intermediate_filament_db',
               ]
    return _download(url, columns, 'hgnc')


def download_ncbi():
    """
    Downloads data from NCBI

    `<https://www.ncbi.nlm.nih.gov/>`_

    Returns
    -------
    pandas.DataFrame
        The selected columns; also saved as ``ncbi.gz`` in
        ``id_mapping_dir``.
    """
    url = 'ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
    columns = ['GeneID', 'Symbol', 'description']
    return _download(url, columns, 'ncbi', compression='gzip')


def _download(url, columns, name, save=True, names=None, compression=None):
    """
    Read a tab-separated table from `url` and keep the requested columns.

    Parameters
    ----------
    url : str
        Location handed to ``pandas.read_table``.
    columns : list
        Columns to keep, in this order.
    name : str
        Short database name; used in log messages and, when `save` is
        true, as the stem of the output file ``<name>.gz``.
    save : bool
        Write the selected columns as a gzipped csv to ``id_mapping_dir``.
    names : list, optional
        Column names for files without a header row. ``None`` lets pandas
        take the header from the file.
    compression : str, optional
        Forwarded unchanged to ``pandas.read_table``.

    Returns
    -------
    pandas.DataFrame
    """
    logger.info("Downloading from {}.".format(name.upper()))
    # `names=None` is the pandas default, so one call covers both the
    # header and header-less cases. `verbose=True` was dropped because it
    # only printed parser diagnostics and is deprecated in recent pandas.
    df = pd.read_table(url, sep='\t', low_memory=False, names=names,
                       compression=compression)
    df = df[columns]
    if save:
        outfile = os.path.join(id_mapping_dir, '{}.gz'.format(name))
        df.to_csv(outfile, compression='gzip', header=True, index=False)
    logger.info("Done downloading from {}.".format(name.upper()))
    return df


def download_hmdb():
    """
    Downloads data from HMDB

    `<http://www.hmdb.ca/>`_
    """
    HMDB().download_db(fresh_download=True)
class HMDB(object):
    """
    Downloads and processes HMDB metabolites database

    `<http://www.hmdb.ca/>`_

    The raw archive is stored as ``hmdb_metabolites.zip`` inside
    ``id_mapping_dir``, extracted into an ``HMDB`` sub-directory, and the
    parsed table is cached as ``hmdb_dataframe.csv.gz``.
    """

    def __init__(self):
        self.id_dir = id_mapping_dir
        self.target_file = 'hmdb_metabolites.zip'
        self.out_name = os.path.join(self.id_dir, 'hmdb_dataframe.csv.gz')
        self.hmdb_file = os.path.join(self.id_dir, self.target_file)

    def load_db(self, fresh_download=False):
        """
        Load the parsed HMDB table, building it first when necessary.

        Parameters
        ----------
        fresh_download : bool
            Rebuild the table even if the cached csv already exists.

        Returns
        -------
        pandas.DataFrame
        """
        if not os.path.exists(self.out_name) or fresh_download:
            self.download_db(fresh_download)
        return pd.read_csv(self.out_name, low_memory=False, encoding='utf-8')

    def download_db(self, fresh_download):
        """
        parse HMDB to Pandas.DataFrame

        Parameters
        ----------
        fresh_download : bool
            Download and unzip the archive again even if the extraction
            directory already has content.
        """
        out_dir = os.path.join(self.id_dir, 'HMDB')
        # create output directory
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # download and unzip xml file
        if fresh_download or not os.listdir(out_dir):
            self._download_hmdb()
            # unzips hmdb metabolites file
            logger.info("Unzipping metabolites file")
            # context manager closes the archive even if extraction fails
            with zipfile.ZipFile(self.hmdb_file, 'r') as zip_ref:
                zip_ref.extractall(out_dir)
            logger.info("Done unzipping metabolites file")

        # creates dataframe
        logger.info("Parsing metabolites information from files")
        xml_path = os.path.join(out_dir, os.listdir(out_dir)[0])
        metabolite_tag = '{http://www.hmdb.ca}metabolite'
        # Only completed elements are needed, so "start" events are not
        # requested at all instead of being requested and then discarded.
        df = pd.DataFrame([
            self._create_dict(elem)
            for _, elem in ElementTree.iterparse(xml_path, events=("end",))
            if elem.tag == metabolite_tag
        ])

        # strip the XML namespace prefix from the column names
        for i in categories:
            if i in df.columns:
                df[i.split('}')[1]] = df[i]
                del df[i]
        df.to_csv(self.out_name, index=False, encoding='utf-8',
                  compression='gzip')
        logger.info("Done processing HMDB")

    def _download_hmdb(self):
        """
        Downloads hmdb metabolites xml file

        The archive is streamed to ``self.hmdb_file``. Progress is logged
        at every 10% when the server reports a content length.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status, so an error page
            is never saved in place of the archive.
        """
        logger.info("Downloading metabolites information from HMDB")

        ur = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
        r = requests.get(ur, stream=True)
        try:
            r.raise_for_status()

            # The header may be absent; fall back to "size unknown"
            # instead of failing with KeyError / ZeroDivisionError.
            content_length = r.headers.get('content-length')
            file_size = int(content_length) if content_length else 0
            if file_size:
                logger.info("File size is {} bytes".format(file_size))
            else:
                logger.info("File size is unknown; "
                            "progress will not be reported")

            file_size_dl = 0
            block_sz = 1024
            reported = set()
            milestone_markers = set(range(0, 101, 10))
            with open(self.hmdb_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if not chunk:  # filter out keep-alive new chunks
                        continue
                    f.write(chunk)
                    file_size_dl += len(chunk)
                    if not file_size:
                        continue
                    percent_done = int(
                        math.floor(file_size_dl * 100. / file_size))
                    if (percent_done in milestone_markers
                            and percent_done not in reported):
                        logger.info("{}%".format(percent_done))
                        reported.add(percent_done)
        finally:
            # release the streamed connection
            r.close()

        logger.info("Done downloading {}.".format(ur))

    @staticmethod
    def _create_dict(elem):
        """
        Flatten one ``metabolite`` element into a dict keyed by category.

        Categories missing from `elem` are left out of the result.
        Multi-valued categories (protein associations, synonyms, secondary
        accessions) are joined with ``'|'``; children without text are
        skipped. Secondary accessions are sorted before joining.

        Parameters
        ----------
        elem : xml.etree.ElementTree.Element
            A metabolite element. It is cleared before returning to free
            memory during iterative parsing.

        Returns
        -------
        dict
            Maps namespaced tag -> text.
        """
        ns = '{http://www.hmdb.ca}'
        template = {}
        for i in categories:
            n = elem.find(i)
            if n is None:
                continue
            if i == ns + 'protein_associations':
                template[i] = '|'.join(
                    gn.text
                    for pr in n.findall(ns + 'protein')
                    for gn in pr.findall(ns + 'gene_name')
                    if gn.text is not None
                )
            elif i == ns + 'synonyms':
                # empty <synonym/> has text None, which join() rejects
                template[i] = '|'.join(
                    p.text for p in n.findall(ns + 'synonym')
                    if p.text is not None
                )
            elif i == ns + 'secondary_accessions':
                # empty <accession/> has text None, which cannot be sorted
                template[i] = '|'.join(sorted(
                    a.text for a in n.findall(ns + 'accession')
                    if a.text is not None
                ))
            else:
                template[i] = n.text
        elem.clear()
        return template


# HMDB metabolite fields extracted by HMDB._create_dict
categories = ['kegg_id', 'name', 'accession', 'chebi_id', 'chemspider_id',
              'biocyc_id', 'synonyms', 'pubchem_compound_id',
              'protein_associations', 'inchikey', 'iupac_name',
              'drugbank_id', 'chemical_formula', 'smiles', 'metlin_id',
              'average_molecular_weight', 'secondary_accessions',
              # 'normal_concentrations',
              # 'molecular_framework'
              # 'pathways',
              ]

# tags in the HMDB xml are namespace-qualified
categories = ['{http://www.hmdb.ca}' + i for i in categories]

valid_uniprot_cols = ['uniprot', 'Allergome', 'BioCyc', 'BioGrid', 'BioMuta',
                      'CCDS', 'CRC64', 'ChEMBL', 'ChiTaRS', 'CleanEx', 'DIP',
                      'DMDM', 'DNASU', 'DisProt', 'DrugBank', 'EMBL',
                      'EMBL-CDS', 'ESTHER', 'Ensembl', 'Ensembl_PRO',
                      'Ensembl_TRS', 'GI', 'GeneCards', 'GeneDB', 'GeneID',
                      'GeneReviews', 'GeneTree', 'GeneWiki', 'Gene_Name',
                      'Gene_ORFName', 'Gene_Synonym', 'GenomeRNAi',
                      'GuidetoPHARMACOLOGY', 'H-InvDB', 'HGNC', 'HOGENOM',
                      'HOVERGEN', 'HPA', 'KEGG', 'KO', 'MEROPS', 'MIM',
                      'MINT', 'NCBI_TaxID', 'OMA', 'Orphanet', 'OrthoDB',
                      'PATRIC', 'PDB', 'PeroxiBase', 'PharmGKB', 'REBASE',
                      'Reactome', 'RefSeq', 'RefSeq_NT', 'STRING',
                      'SwissLipids', 'TCDB', 'TreeFam', 'UCSC', 'UniGene',
                      'UniParc', 'UniPathway', 'UniProtKB-ID', 'UniRef100',
                      'UniRef50', 'UniRef90', 'eggNOG', 'neXtProt']

ncbi_valid_categories = ['GeneID', 'Symbol', 'description']

hgnc_valid_categories = ['symbol', 'uniprot_ids', 'ensembl_gene_id', 'name',
                         'location', 'entrez_id', 'ucsc_id', 'vega_id',
                         'alias_name', 'alias_symbol', 'status',
                         'gene_family', 'gene_family_id', 'ena', 'iuphar',
                         'cd', 'refseq_accession', 'ccds_id', 'pubmed_id',
                         'mgd_id', 'rgd_id', 'lsdb', 'bioparadigms_slc',
                         'enzyme_id', 'merops', 'horde_id', 'pseudogene.org',
                         'cosmic', 'rna_central_ids', 'omim_id', 'imgt',
                         'intermediate_filament_db'
                         ]

if __name__ == '__main__':
    # download_hgnc()
    download_uniprot('hsa')
    # hmdb = HMDB()
    # df = hmdb.load_db(fresh_download=True)
    # print(df.head(10))
    # print(df.columns)