Source code for magine.enrichment.enrichr

import json
import logging
import os
import re

import numpy as np
import pandas as pd
import requests

from magine.enrichment.enrichment_result import EnrichmentResult
from magine.logging import get_logger
from magine.plotting.species_plotting import write_table_to_html

logger = get_logger(__name__, log_level=logging.INFO)

_valid_libs = set()
with open(os.path.join(os.path.dirname(__file__),
                       '_valid_enricher_libs.txt'), 'r') as f:
    for n in f.read().splitlines():
        _valid_libs.add(n)

gene = 'gene'

db_types = {
    'histone': [
        'Epigenomics_Roadmap_HM_ChIP-seq',
        'ENCODE_Histone_Modifications_2015',
        'ESCAPE',
    ],
    'mrna': [
        'TargetScan_microRNA_2017',
        'miRTarBase_2017',
    ],
    'kinases': [
        'KEA_2015',
        'Kinase_Perturbations_from_GEO_down',
        'Kinase_Perturbations_from_GEO_up',
        'Phosphatase_Substrates_from_DEPOD',
        'ARCHS4_Kinases_Coexp',
        'ARCHS4_IDG_Coexp',
    ],
    'transcription': [
        'ChEA_2016',
        'TRANSFAC_and_JASPAR_PWMs',
        'ARCHS4_TFs_Coexp',
        'Enrichr_Submissions_TF-Gene_Coocurrence',
        'Genome_Browser_PWMs',
        'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ENCODE_TF_ChIP-seq_2015',
        'TF-LOF_Expression_from_GEO',
        'Transcription_Factor_PPIs'
    ],
    'pathways': [
        'KEGG_2016',
        'WikiPathways_2016',
        'Reactome_2016',
        'BioCarta_2016',
        'NCI-Nature_2016',
        'Panther_2016',
    ],
    'complex': [
        'NURSA_Human_Endogenous_Complexome',
        'CORUM',
        'PPI_Hub_Proteins',
        'BioPlex_2017',
    ],
    'ontologies': [
        'GO_Cellular_Component_2018',
        'GO_Biological_Process_2018',
        'GO_Molecular_Function_2018',
        'MGI_Mammalian_Phenotype_2017',
        'Jensen_COMPARTMENTS',
    ],
    'drug': [
        'DrugMatrix',
        'Drug_Perturbations_from_GEO_2014',
        'Old_CMAP_up',
        'Old_CMAP_down',
        'LINCS_L1000_Chem_Pert_up',
        'LINCS_L1000_Chem_Pert_down',
        'LINCS_L1000_Ligand_Perturbations_up',
        'LINCS_L1000_Ligand_Perturbations_down',
    ],
    'disease': [
        'OMIM_Disease',
        'OMIM_Expanded',
        'Jensen_DISEASES',
        'Human_Phenotype_Ontology'
    ],
    'cell_type': [
        'ARCHS4_Tissues',
        'ARCHS4_Cell-lines',
        'Allen_Brain_Atlas_up',
        'Allen_Brain_Atlas_down',
        'Cancer_Cell_Line_Encyclopedia',
        'Human_Gene_Atlas',
        'NCI-60_Cancer_Cell_Lines',
        # 'Jensen_TISSUES',
    ],
    'crowd': [
        'Disease_Perturbations_from_GEO_down',
        'Disease_Perturbations_from_GEO_up',
        'Drug_Perturbations_from_GEO_down',
        'Drug_Perturbations_from_GEO_up',
        'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
        'Aging_Perturbations_from_GEO_up',
        'Aging_Perturbations_from_GEO_down',
        'Ligand_Perturbations_from_GEO_up',
        'Ligand_Perturbations_from_GEO_down',
        'MCF7_Perturbations_from_GEO_up',
        'MCF7_Perturbations_from_GEO_down',
        'Microbe_Perturbations_from_GEO_up',
        'Microbe_Perturbations_from_GEO_down',
        'SysMyo_Muscle_Gene_Sets',
    ]
}
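# Usage sketch (illustrative only, not part of the library): each db_types
# category is a list of Enrichr library names, and Enrichr.run accepts either
# a single library name or a list of them, so a whole category can be run at
# once. Requires network access to the Enrichr service.
#
#   >>> e = Enrichr()
#   >>> df = e.run(['BAX', 'BCL2', 'CASP3'], gene_set_lib=db_types['pathways'])
#   >>> sorted(df['db'].unique())  # one block of results per library queried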


def get_libraries():
    url = 'http://maayanlab.cloud/Enrichr/datasetStatistics'
    libs_json = json.loads(requests.get(url).text)
    return [lib['libraryName'] for lib in libs_json['statistics']]
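# Usage sketch (assumes network access): get_libraries() queries the live
# Enrichr service, whereas _valid_libs above is the cached list shipped with
# the package, so the two may differ as Enrichr adds new libraries.
#
#   >>> libs = get_libraries()
#   >>> len(libs)  # number of libraries currently hosted by Enrichr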


class Enrichr(object):
    """ Interface to the Enrichr enrichment analysis web service. """

    _query = '{url}/enrich?userListId={list_id}&backgroundType={lib}'

    def __init__(self, verbose=False):
        self._url = 'http://maayanlab.cloud/Enrichr/'
        self._valid_libs = _valid_libs
        self.verbose = verbose
    def print_valid_libs(self):
        """ Print a list of all available libraries EnrichR has to offer. """
        for lib_name in sorted(self._valid_libs):
            print(lib_name)
    def run(self, list_of_genes, gene_set_lib='GO_Biological_Process_2017'):
        """ Run enrichment analysis on a single list of genes.

        Parameters
        ----------
        list_of_genes : list_like
            List of genes using HGNC gene names
        gene_set_lib : str or list
            Name of gene set library.
            To print options use Enrichr.print_valid_libs

        Examples
        --------
        >>> import pandas as pd
        >>> pd.set_option('display.max_colwidth', 40)
        >>> pd.set_option('precision', 3)
        >>> e = Enrichr()
        >>> df = e.run(['BAX', 'BCL2', 'CASP3', 'CASP8'],
        ...            gene_set_lib='Reactome_2016')
        >>> print(df[['term_name','combined_score']].head(5)) #doctest: +NORMALIZE_WHITESPACE
                                         term_name  combined_score
        0  intrinsic pathway for apoptosis hsa ...       11814.410
        1               apoptosis hsa r-hsa-109581        2365.141
        2  programmed cell death hsa r-hsa-5357801        2313.527
        3  caspase-mediated cleavage of cytoske...       10944.261
        4  caspase activation via extrinsic apo...        4245.542

        Returns
        -------
        df : EnrichmentResult
            Results from enrichR
        """
        if not isinstance(list_of_genes, (list, set)):
            raise AssertionError("list_of_genes must be list like")

        logger.debug("Running Enrichr with gene set {}".format(gene_set_lib))

        user_list_id = self._add_gene_list(list_of_genes)
        if isinstance(gene_set_lib, str):
            df = self._run_id(user_list_id, gene_set_lib)
        else:
            df = self._run_list_of_dbs(user_list_id, gene_set_lib)

        init_size = len(df)
        if init_size == 0:
            logger.debug("No terms returned")
            return df

        # flag all terms with an adj_p_value at or below 0.05 as significant
        df['significant'] = False
        crit = (df['adj_p_value'] <= 0.05)
        df.loc[crit, 'significant'] = True

        after_size = len(df)
        if init_size != after_size:
            raise AssertionError('not the same shape {}'.format(gene_set_lib))
        logger.debug("Done calling Enrichr.")
        return df
    def run_samples(self, sample_lists, sample_ids,
                    gene_set_lib='GO_Biological_Process_2017', save_name=None,
                    create_html=False, out_dir=None, run_parallel=False,
                    exp_data=None, pivot=False):
        """ Run enrichment analysis on a list of samples.

        Parameters
        ----------
        sample_lists : list_like
            List of lists of genes for enrichment analysis
        sample_ids : list
            List of ids for the provided sample lists
        gene_set_lib : str or list
            Type of gene set, refer to Enrichr.print_valid_libs
        save_name : str, optional
            If provided, a pivoted table of term_ids vs sample_ids is saved
            to file
        create_html : bool
            Create html output with plots of species across samples
        out_dir : str
            If create_html, all html plots are placed into this directory
        run_parallel : bool
            If create_html, plots are created using multiprocessing
        exp_data : magine.data.ExperimentalData
            Must be provided if create_html=True
        pivot : bool

        Examples
        --------

        .. plot::
            :context: close-figs

            >>> import pandas as pd
            >>> import matplotlib.pyplot as plt
            >>> from magine.enrichment.enrichr import Enrichr
            >>> pd.set_option('display.max_colwidth', 40)
            >>> pd.set_option('precision', 3)
            >>> samples = [['BAX', 'BCL2', 'CASP3', 'CASP8'],
            ...            ['ATR', 'ATM', 'TP53', 'CHEK1']]
            >>> sample_ids = ['apoptosis', 'dna_repair']
            >>> e = Enrichr()
            >>> df = e.run_samples(samples, sample_ids,
            ...                    gene_set_lib='Reactome_2016')
            >>> print(df[['term_name','combined_score']].head(5)) #doctest: +NORMALIZE_WHITESPACE
                                             term_name  combined_score
            0  intrinsic pathway for apoptosis hsa ...       11814.410
            1               apoptosis hsa r-hsa-109581        2365.141
            2  programmed cell death hsa r-hsa-5357801        2313.527
            3  caspase-mediated cleavage of cytoske...       10944.261
            4  caspase activation via extrinsic apo...        4245.542
            >>> df.filter_multi(rank=10, inplace=True)
            >>> df['term_name'] = df['term_name'].str.split('_').str.get(0)
            >>> fig = df.sig.heatmap(figsize=(6, 6), linewidths=.05)

        Returns
        -------
        EnrichmentResult
        """
        if not isinstance(sample_lists, list):
            raise AssertionError("List required")
        if not isinstance(sample_lists[0], (list, set)):
            raise AssertionError("List of lists required")

        df_final = self.run(sample_lists[0], gene_set_lib)
        df_final['sample_id'] = sample_ids[0]

        for count, (i, j) in enumerate(zip(sample_lists[1:],
                                           sample_ids[1:])):
            df = self.run(i, gene_set_lib)
            df['sample_id'] = j
            df_final = df_final.append(df, ignore_index=True)

        # remove terms that do not have at least 1 significant hit across
        # any of the sample lists provided
        df_final.require_n_sig(n_sig=1, inplace=True)
        df_final = EnrichmentResult(df_final)

        if save_name:
            s_name = '{}_enrichr'.format(save_name)
            if pivot:
                # pivot output
                p_df = pd.pivot_table(
                    df_final, index=['term_name', 'db'],
                    columns='sample_id', aggfunc='first', fill_value=np.nan,
                    values=['term_name', 'rank', 'p_value', 'z_score',
                            'combined_score', 'adj_p_value', 'genes',
                            'significant'],
                )
                p_df.to_excel('{}.xlsx'.format(s_name), merge_cells=True)
            # save files
            df_final.to_csv('{}.csv'.format(s_name), index=False)

        if create_html:
            if exp_data is None:
                raise AssertionError("exp_data required for plots")
            write_table_to_html(data=df_final, save_name=save_name,
                                out_dir=out_dir, run_parallel=run_parallel,
                                exp_data=exp_data)

        return df_final
    def _run_id(self, list_id, gene_set_lib):
        if gene_set_lib not in _valid_libs:
            raise AssertionError(
                "{} not in valid ids {}".format(gene_set_lib, _valid_libs)
            )
        q = self._query.format(url=self._url, list_id=list_id,
                               lib=gene_set_lib)
        response = requests.get(q, timeout=300)
        if not response.ok:
            logger.warning("{} library failed to run on enrichR's side. "
                           "View their response for more info: {}"
                           "".format(gene_set_lib, q))
            return EnrichmentResult()

        data = json.loads(response.text)
        if gene_set_lib not in data:
            raise Exception("{} not in enrichR".format(gene_set_lib))
        if len(data[gene_set_lib]) == 0:
            return EnrichmentResult()

        #####
        # EnrichR returns a list of entries, each entry containing
        # Rank, Term name, P-value, Z-score, Combined score,
        # Overlapping genes, Adjusted p-value, Old p-value,
        # Old adjusted p-value
        #####
        df = EnrichmentResult(
            data[gene_set_lib],
            columns=['rank', 'term_name', 'p_value', 'z_score',
                     'combined_score', 'gene_hits', 'adj_p_value', '_', '_']
        )

        def compress_genes(row):
            return ','.join(g for g in sorted(row['gene_hits']))

        def get_length(row):
            return len(row['gene_hits'])

        # compress genes into a string and count the hits
        df['genes'] = df.apply(compress_genes, axis=1)
        df['n_genes'] = df.apply(get_length, axis=1)

        # remove rows with null term names
        df = df[~df['term_name'].isnull()].copy()

        # keep only desired columns
        cols = ['term_name', 'rank', 'p_value', 'z_score', 'combined_score',
                'adj_p_value', 'genes', 'n_genes']
        df = df[cols]
        df['db'] = gene_set_lib
        df['combined_score'] = pd.to_numeric(df['combined_score'],
                                             errors='coerce')

        with pd.option_context('mode.use_inf_as_null', True):
            # remove rows without a term name, NaNs, or infinite values
            df = df.dropna(
                subset=['term_name', 'adj_p_value', 'combined_score']
            )
        try:
            df = _prepare_output(df)
        except Exception:
            logger.warning("Unable to simplify enrichR term names. This "
                           "could be due to a change in enrichR formatting. "
                           "Returning the default output.")
        return df

    def _run_list_of_dbs(self, gene_list_id, databases):
        data = self._run_id(gene_list_id, databases[0])
        for db in databases[1:]:
            data = data.append(self._run_id(gene_list_id, db),
                               ignore_index=True)
            logger.debug('\t\t{}/{} databases'.format(db, len(databases)))
        return data

    def _add_gene_list(self, gene_list):
        """ Upload a gene list to enrichR.

        Parameters
        ----------
        gene_list : list

        Returns
        -------
        str :
            userListId that maps to the gene list provided, used within
            enrichR to gather enrichment results.
        """
        genes_str = '\n'.join(gene_list)
        payload = {
            'list': (None, genes_str),
            'description': (None, 'MAGINE analysis')
        }

        response = requests.post(self._url + '/addList', files=payload)
        if not response.ok:
            raise Exception('Error analyzing gene list', response.ok)
        data = json.loads(response.text)
        return data['userListId']
def _prepare_output(df):
    df['term_name'] = df.apply(clean_term_names, axis=1)
    return df
def clean_term_names(row):
    """ Simplify a term name based on the database it came from. """
    term_name = row['term_name']
    if not isinstance(term_name, str):
        return term_name
    db = row['db']
    if db in ['GO_Biological_Process_2018', 'GO_Molecular_Function_2018',
              'GO_Cellular_Component_2018', 'GO_Biological_Process_2017',
              'GO_Molecular_Function_2017', 'GO_Cellular_Component_2017',
              'GO_Biological_Process_2017b', 'GO_Molecular_Function_2017b',
              'GO_Cellular_Component_2017b']:
        if 'GO:' in term_name:
            term_name = term_name.split('(GO:', 1)[0]
        elif 'go:' in term_name:
            term_name = term_name.split('(go:', 1)[0]
    elif db == 'Human_Phenotype_Ontology':
        if 'HP:' in term_name:
            term_name = term_name.split('(HP:', 1)[0]
    elif db == 'MGI_Mammalian_Phenotype_2017':
        if term_name.startswith('MP:'):
            term_name = term_name.split(' ', 1)[1]
    elif db == 'DrugMatrix':
        try:
            drug_name = re.search(r'^(.*)(-\d*.*\d_)', term_name).group(1)
            direction = re.search(r'-(.{2})$', term_name).group(0)
            term_name = drug_name + direction
        except AttributeError:
            # no regex match; leave the term name unchanged
            pass
    elif db in ['Old_CMAP_down', 'Old_CMAP_up']:
        term_name = term_name.rsplit('-', 1)[0]
    elif db in ['Ligand_Perturbations_from_GEO_down',
                'Ligand_Perturbations_from_GEO_up']:
        term_name = term_name.split('_', 1)[0]

    term_name = term_name.strip()
    term_name = term_name.lower()
    for i, j in replace_pairs:
        if i in term_name:
            term_name = term_name.replace(i, j)
    return term_name
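# Illustrative sketch (the term name below is hypothetical, not taken from the
# library's tests): for a GO library the "(GO:...)" accession is stripped and
# the remainder is lower-cased.
#
#   >>> clean_term_names({'term_name': 'Apoptotic Process (GO:0006915)',
#   ...                   'db': 'GO_Biological_Process_2018'})
#   'apoptotic process'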
def clean_lincs(df):
    """ Cleans the LINCS database term_names returned from enrichR.

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame
    """
    pattern = re.compile(
        r'(?P<id>\w+) (?P<cell>\w+) (?P<time>\w+)-(?P<drug>.*)-(\S*)'
    )

    def get_drug(row):
        db = row['db']
        term_name = row['term_name']
        if db not in ['LINCS_L1000_Chem_Pert_up',
                      'LINCS_L1000_Chem_Pert_down']:
            return term_name
        try:
            return pattern.search(term_name).group('drug')
        except AttributeError:
            # If the string doesn't match the pattern above, there is no way
            # to parse it reliably. A few entries use spaces instead of "_",
            # which is hard to parse because drug names can also contain
            # spaces ("sulfide salts"), e.g.
            # cpc006 snuc5 6h-quinine hemisulfate salt monohydrate-10.0
            return term_name

    df['term_name'] = df.apply(get_drug, axis=1)
    return df
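# Illustrative sketch (the term name below is hypothetical, following the
# "<id> <cell> <time>-<drug>-<dose>" layout the regex above expects):
#
#   >>> df = pd.DataFrame([{'db': 'LINCS_L1000_Chem_Pert_up',
#   ...                     'term_name': 'ABC123 MCF7 24h-vorinostat-10.0'}])
#   >>> clean_lincs(df)['term_name'].iloc[0]
#   'vorinostat'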
def clean_drug_pert_geo(df):
    """ Cleans Drug_Perturbations_from_GEO_2014 term_names from enrichR. """

    def get_drug(row):
        db = row['db']
        term_name = row['term_name']
        if db != 'Drug_Perturbations_from_GEO_2014':
            return term_name
        try:
            segs = term_name.split('_')
            drug_name = segs[0] + '_' + segs[-1][0] + segs[-1][-1]
        except IndexError:
            # term name did not follow the expected '_'-delimited layout
            drug_name = term_name
        return drug_name

    df['term_name'] = df.apply(get_drug, axis=1)
    return df
def clean_drug_dbs(data):
    df_copy = data.copy()
    if 'Drug_Perturbations_from_GEO_2014' in df_copy.db.unique():
        df_copy = clean_drug_pert_geo(df_copy)
    if 'LINCS_L1000_Chem_Pert_up' in df_copy.db.unique() or \
            'LINCS_L1000_Chem_Pert_down' in df_copy.db.unique():
        df_copy = clean_lincs(df_copy)
    return df_copy
def clean_tf_names(data):
    """ Cleans transcription factor databases by removing everything
    after '_'.

    Parameters
    ----------
    data : pd.DataFrame

    Returns
    -------
    pd.DataFrame
    """

    def return_single_tf(row):
        """ Gets the TF name only. """
        tf = row['term_name']
        if '_' in tf:
            tf = tf.split('_')[0]
        return tf.upper()

    tf_dbs = ['ARCHS4_TFs_Coexp',
              'ChEA_2016',
              'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
              'ENCODE_TF_ChIP-seq_2015',
              'Enrichr_Submissions_TF-Gene_Coocurrence',
              'TRANSFAC_and_JASPAR_PWMs',
              'TF-LOF_Expression_from_GEO',
              'Transcription_Factor_PPIs']

    tfs = data[data['db'].isin(tf_dbs)].copy()
    tfs['term_name'] = tfs.apply(return_single_tf, axis=1)
    return tfs
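# Illustrative sketch (hypothetical ChEA-style term name): everything after
# the first '_' is dropped and the TF symbol is upper-cased. Note that only
# rows whose 'db' is in tf_dbs are returned.
#
#   >>> df = pd.DataFrame([{'db': 'ChEA_2016', 'term_name': 'Stat3_chip-seq'}])
#   >>> clean_tf_names(df)['term_name'].iloc[0]
#   'STAT3'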
replace_pairs = [
    ('mus musculus', 'mus'),
    ('homo sapiens', 'hsa'),
    ('rattus norvegicus', 'rat'),
    ('oncorhynchus mykiss', 'onc_mykiss'),
    ('macaca fascicularis', 'mfa'),
    ('oryzias latipes', 'ola'),
    ('sus scrofa', 'ssc'),
    ('danio rerio', 'dre'),
    ('bos taurus', 'bta'),
    ('dictyostelium discoideum', 'dicty'),
    ('myzus persicae', 'm.persicae'),
    ('staphylococcus aureus', 's.aureus'),
    ('escherichia coli', 'e.coli'),
    ('pseudomonas aeruginosa', 'p.aeruginosa'),
    ('drosophila melanogaster', 'fly'),
    ('mycobacterium tuberculosis', 'm.tuberculosis'),
    ('hordeum vulgare', 'h.vulgare'),
    (' (mouse)', '_mus'),
    (' (human)', '_hsa'),
    ('Homo sapiens', 'hsa'),
    ('Mus musculus', 'mus'),
    ('hg19', '_hsa'),
    ('mm9', '_mus'),
    (' hsa r-hsa', '_hsa'),
]

standard_dbs = []
for db in ['drug', 'disease', 'ontologies', 'pathways', 'transcription',
           'kinases', 'histone', 'cell_type']:
    standard_dbs += db_types[db]
def get_background_list(lib_name):
    """ Return the reference gene list for a given gene set library.

    Parameters
    ----------
    lib_name : str

    Returns
    -------
    list of dict
    """
    enrichment_url = 'http://maayanlab.cloud/Enrichr/geneSetLibrary'
    query_string = '?userListId&libraryName={}'
    response = requests.get(
        enrichment_url + query_string.format(lib_name)
    )
    if not response.ok:
        raise Exception('Error fetching enrichment results')

    results = json.loads(response.text)
    term_to_gene = []
    if lib_name not in results:
        raise AssertionError("Something happened when calling enrichR.")
    for term, genes_dict in results[lib_name]['terms'].items():
        genes = sorted(set(i for i in genes_dict))
        term_to_gene.append(
            dict(term=term.lower(), gene_list=genes, n_genes=len(genes))
        )
    return term_to_gene
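# Usage sketch (requires network access; only the output shape is shown, the
# values depend on the live Enrichr library): each entry is a dict with
# 'term', 'gene_list', and 'n_genes' keys.
#
#   >>> ref = get_background_list('Reactome_2016')
#   >>> sorted(ref[0].keys())
#   ['gene_list', 'n_genes', 'term']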
def run_enrichment_for_project(exp_data, project_name, databases=None,
                               output_path=None):
    """ Run enrichment across all samples and sources of a project.

    Parameters
    ----------
    exp_data : magine.data.experimental_data.ExperimentalData
    project_name : str
    databases : list
    output_path : str
        Location to save all individual enrichment output files created.

    """
    if databases is None:
        databases = standard_dbs
    logger.info("Running enrichment on project")
    logger.info("Running {} databases".format(len(databases)))
    e = Enrichr(verbose=True)
    all_df = []

    if output_path is None:
        _dir = os.path.join(os.getcwd(), 'enrichment_output')
    else:
        _dir = output_path
    if not os.path.exists(_dir):
        logger.info("Creating output directory: {}".format(_dir))
        os.mkdir(_dir)

    def _run_new(samples, timepoints, category):
        logger.info("Running {}".format(category))
        for genes, sample_id in zip(samples, timepoints):
            if not len(genes):
                continue
            logger.info('sample_id = {}'.format(sample_id))
            current = "{}_{}_{}".format(category, sample_id, project_name)
            name = os.path.join(_dir, current + '.csv.gz')
            try:
                # reuse cached results if this sample was already run
                df = pd.read_csv(name, index_col=None, encoding='utf-8')
            except Exception:
                # no cached result (or it could not be read); run enrichment
                df = e.run(genes, databases)
                df['sample_id'] = sample_id
                df['category'] = category
                df.to_csv(name, index=False, encoding='utf-8',
                          compression='gzip')
            df['sample_id'] = sample_id
            df['category'] = category
            all_df.append(df)

    # run each experimental 'source' by time point
    # (label-free, ph-silac, etc. are all separate)
    for source in exp_data.exp_methods:
        df = exp_data[source].sig
        col_options = df['species_type'].unique()
        # make sure there is only a single species type and it is protein;
        # this ensures the RNA-seq doesn't get run twice
        if len(col_options) == 1 and col_options[0] == 'protein':
            _run_new(df.by_sample, df.sample_ids, '{}_both'.format(source))
            _run_new(df.up_by_sample, df.sample_ids, '{}_up'.format(source))
            _run_new(df.down_by_sample, df.sample_ids,
                     '{}_down'.format(source))

    # run all protein labeled species grouped by time point
    # (label-free, ph-silac, etc. are all combined)
    if len(exp_data.proteins.sample_ids) != 0:
        sample = exp_data.proteins.sig
        if len(sample.source.unique()) == 1:
            print("Only single source found. "
                  "No aggregation enrichment to perform")
        else:
            _run_new(sample.by_sample, sample.sample_ids, 'proteomics_both')
            _run_new(sample.up_by_sample, sample.sample_ids, 'proteomics_up')
            _run_new(sample.down_by_sample, sample.sample_ids,
                     'proteomics_down')

    # run all RNA labeled species grouped by time point
    if len(exp_data.rna.sample_ids) != 0:
        sample = exp_data.rna.sig
        _run_new(sample.by_sample, sample.sample_ids, 'rna_both')
        _run_new(sample.down_by_sample, sample.sample_ids, 'rna_down')
        _run_new(sample.up_by_sample, sample.sample_ids, 'rna_up')

    # merge all outputs
    final_df = pd.concat(all_df, ignore_index=True)
    final_df = final_df[
        ['term_name', 'rank', 'combined_score', 'adj_p_value', 'genes',
         'n_genes', 'sample_id', 'category', 'db']
    ]
    final_df.dropna(inplace=True)

    # enrichR started returning infinite values, which come back as str
    # rather than numeric. To filter them, the column must be converted
    # to numeric first.
    final_df['combined_score'] = pd.to_numeric(final_df['combined_score'],
                                               errors='coerce')
    with pd.option_context('mode.use_inf_as_null', True):
        # remove rows without a term name, NaNs, or infinite values
        final_df = final_df.dropna(
            subset=['term_name', 'adj_p_value', 'combined_score']
        )

    # add the significant column
    final_df.loc[(final_df['adj_p_value'] <= 0.05) &
                 (final_df['combined_score'] > 0.0), 'significant'] = True

    logger.info("Saving output: {}_enrichment.csv.gz".format(project_name))
    final_df.to_csv('{}_enrichment.csv.gz'.format(project_name),
                    encoding='utf-8', compression='gzip', index=False)
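# Usage sketch (assumes ``exp_data`` is an already-constructed
# magine.data.experimental_data.ExperimentalData object; the project name and
# output path below are placeholders):
#
#   >>> run_enrichment_for_project(exp_data, 'my_project',
#   ...                            databases=db_types['pathways'],
#   ...                            output_path='enrichment_output')
#
# Per-sample results are cached as csv.gz files in ``output_path`` and the
# merged table is written to 'my_project_enrichment.csv.gz'.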