Source code for magine.enrichment.enrichment_result

import itertools
import operator
from collections import OrderedDict
from itertools import combinations

import numpy as np
import pandas as pd

from magine.data.base import BaseData
from magine.plotting.heatmaps import cluster_distance_mat

# Python 2 ships a ``basestring`` builtin; Python 3 does not. Alias it to
# ``str`` there so ``isinstance(foo, basestring)`` works on both versions.
try:
    basestring
except NameError:
    # Only a missing name is expected here; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    basestring = str

sig = 'significant'


def load_enrichment_csv(file_name, **args):
    """
    Read a CSV file and wrap its contents in an EnrichmentResult.

    Parameters
    ----------
    file_name : str
        Path of the CSV file, handed to ``pandas.read_csv``.
    **args
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    EnrichmentResult

    """
    return EnrichmentResult(pd.read_csv(file_name, **args))
class EnrichmentResult(BaseData):
    """
    Table of enrichment results with filtering and term-similarity helpers.

    The methods below read the columns ``term_name`` and ``genes`` (a
    comma-separated string of gene names) and, depending on the method,
    ``adj_p_value``, ``combined_score``, ``rank``, ``db``, ``sample_id`` and
    ``category``.
    """

    def __init__(self, *args, **kwargs):
        super(EnrichmentResult, self).__init__(*args, **kwargs)
        # Column-name settings; presumably consumed by BaseData, which is
        # defined outside this file -- confirm there.
        self._index = 'term_name'
        self._identifier = 'term_name'
        self._value_name = 'combined_score'
        self._sample_id_name = 'sample_id'

    @property
    def _constructor(self):
        # Makes pandas operations (copy, slicing, sorting) hand back
        # EnrichmentResult objects instead of plain DataFrames.
        return EnrichmentResult

    @staticmethod
    def _genes_to_set(gene_column):
        """Flatten a column of comma-separated gene strings into one set."""
        # Missing entries are skipped; they cannot be split.
        return set(itertools.chain.from_iterable(
            entry.split(',') for entry in gene_column.dropna()
        ))

    def filter_rows(self, column, options, inplace=False):
        """
        Keep only the rows whose value in `column` matches `options`.

        Parameters
        ----------
        column : str
            Column to filter on.
        options : str, list, tuple or set
            A single value or a collection of values to keep.
        inplace : bool
            Filter inplace

        Returns
        -------
        EnrichmentResult or None
            The filtered copy, or None when `inplace` is True.

        Notes
        -----
        Values that do not occur in `column` are reported with ``print``.
        An unknown single string leaves the data unfiltered, whereas unknown
        entries of a collection are simply not matched.

        """
        new_data = self.copy()
        valid_opts = sorted(new_data[column].unique())
        if isinstance(options, basestring):
            if options not in valid_opts:
                print('{} not in {}'.format(options, valid_opts))
            else:
                new_data = new_data[new_data[column] == options]
        elif isinstance(options, (list, tuple, set)):
            for i in options:
                if i not in valid_opts:
                    print('{} not in {}'.format(i, valid_opts))
            new_data = new_data[new_data[column].isin(options)]

        if inplace:
            self._update_inplace(new_data)
        else:
            return new_data

    def filter_multi(self, p_value=None, combined_score=None, db=None,
                     sample_id=None, category=None, rank=None, inplace=False):
        """
        Filters an enrichment array.

        This is an aggregate function that allows ones to filter an entire
        dataframe with a single function call.

        Parameters
        ----------
        p_value : float
            Keeps rows whose ``adj_p_value`` is less than or equal
        combined_score : float
            Keeps rows whose ``combined_score`` is greater than or equal
        db : str, list
        sample_id : str, list
        category : str, list
        rank : int
            Keeps rows whose ``rank`` is less than or equal. Ignored unless
            it is an int or float.
        inplace : bool
            Filter inplace

        Returns
        -------
        new_data : EnrichmentResult
            The filtered copy, or None when `inplace` is True.

        Raises
        ------
        AssertionError
            If `p_value` or `combined_score` is given but is not a number.

        """
        new_data = self.copy()
        if p_value is not None:
            if not isinstance(p_value, (int, float)):
                raise AssertionError("p_value must be a float or int")
            new_data = new_data[new_data['adj_p_value'] <= p_value]
        if combined_score is not None:
            if not isinstance(combined_score, (int, float)):
                raise AssertionError("combined_score must be a float or int")
            new_data = new_data[new_data['combined_score'] >= combined_score]
        if isinstance(rank, (int, float)):
            new_data = new_data[new_data['rank'] <= rank]

        # The categorical filters all go through filter_rows.
        categorical = (('db', db), ('sample_id', sample_id),
                       ('category', category))
        for column, wanted in categorical:
            if wanted is not None:
                new_data = new_data.filter_rows(column, wanted)

        if inplace:
            self._update_inplace(new_data)
        else:
            return new_data

    def term_to_genes(self, term):
        """
        Get set of genes of provides term(s)

        Parameters
        ----------
        term : str, list
            One term name or a collection of term names.

        Returns
        -------
        set
            Union of the genes of every row whose ``term_name`` matches.

        """
        terms = [term] if isinstance(term, basestring) else term
        genes = self[self['term_name'].isin(terms)]['genes']
        return self._genes_to_set(genes)

    def term_to_genes_dict(self, term_list=None):
        """
        Group genes by the exact combination of terms that contain them.

        Parameters
        ----------
        term_list : list, str, optional
            Terms to consider. Defaults to every term in the dataframe.

        Returns
        -------
        OrderedDict
            Keys are comma-joined, alphabetically sorted term names; values
            are the sets of genes found in exactly that combination of
            terms. Entries are ordered by key.

        """
        if term_list is None:
            term_list = set(self['term_name'].values)
        elif isinstance(term_list, basestring):
            term_list = [term_list]

        # First invert term -> genes into gene -> terms ...
        gene_to_term = {}
        for term in term_list:
            for gene in self.term_to_genes(term):
                gene_to_term.setdefault(gene, set()).add(term)

        # ... then bucket genes that share the same set of terms.
        term_to_gene = {}
        for gene, terms in gene_to_term.items():
            name = ','.join(sorted(terms))
            term_to_gene.setdefault(name, set()).add(gene)

        return OrderedDict(
            sorted(term_to_gene.items(), key=operator.itemgetter(0))
        )

    def all_genes_from_df(self):
        """
        Returns all genes from gene columns in a set

        Returns
        -------
        set

        """
        return self._genes_to_set(self['genes'])

    def filter_based_on_words(self, words, inplace=False):
        """
        Filter term_name based on key terms

        Parameters
        ----------
        words : list, str
            List of words to use to keep rows in dataframe. The words are
            joined with ``|`` and matched as a regular expression,
            ignoring case.
        inplace : bool
            Filter the dataframe in place or return filtered copy

        Returns
        -------
        EnrichmentResult or None
            The filtered copy, or None when `inplace` is True.

        """
        if isinstance(words, basestring):
            words = [words]
        df = self.copy()
        # case=False ignores case on both sides, so words containing
        # upper-case letters match too; na=False drops rows without a name.
        matches = df['term_name'].str.contains('|'.join(words), case=False,
                                               na=False)
        df = df[matches]
        if inplace:
            self._update_inplace(df)
        else:
            return df

    def find_similar_terms(self, term, level='sample', remove_subset=True):
        """
        Calculates similarity of all other terms to given term

        Parameters
        ----------
        term : str
        level : str
            'dataframe' merges the genes of all rows sharing a term_name
            before scoring; any other value scores every row on its own.
        remove_subset : bool
            If any term is a subset of the other term, a score of 1 will be
            used instead of jaccard index.

        Returns
        -------
        pd.DataFrame
            Columns ``term_name`` and ``similarity_score``, sorted by
            descending score.

        """
        rest_of_df = self[self['term_name'] != term].copy()
        first_genes = self.term_to_genes(term)
        if level == 'dataframe':
            vals = [(name, self.term_to_genes(name))
                    for name in rest_of_df['term_name'].unique()]
        else:
            # Use sets so duplicated gene names cannot skew the sizes
            # compared inside jaccard_index.
            vals = [(name, set(genes.split(',')))
                    for name, genes
                    in rest_of_df[['term_name', 'genes']].values]

        dist_m = [[name, jaccard_index(first_genes, genes, remove_subset)]
                  for name, genes in vals]
        df = pd.DataFrame(dist_m, columns=['term_name', 'similarity_score'])
        return df.sort_values('similarity_score', ascending=False)

    def show_terms_below(self, term, level='dataframe', threshold=.7,
                         remove_subset=True):
        """
        Find terms that were removed by remove_redundant

        Parameters
        ----------
        term : str
        level : str
        threshold : float
        remove_subset : bool

        Returns
        -------
        EnrichmentResult
            Rows of `term` and of every term that is at least `threshold`
            similar to it and is dropped by `remove_redundant`, ordered by
            descending ``combined_score``.

        """
        temp_df = self.copy()

        # calculate similarity of term to all terms
        sim_terms = temp_df.find_similar_terms(term,
                                               remove_subset=remove_subset,
                                               level=level)
        # gather terms that are highly similar
        sim_terms = sim_terms.loc[sim_terms.similarity_score >= threshold]
        high_similar_terms = set(sim_terms.term_name.values)

        # this shows all terms that remained after filtering
        # Will be used to compare against
        term_kept = temp_df.remove_redundant(
            threshold=threshold, level=level, inplace=False,
            sort_by='combined_score'
        )
        term_kept = set(term_kept.term_name.values)
        terms_removed = high_similar_terms.difference(term_kept)

        # Adding the original term to show similarity
        terms_removed.add(term)

        # Rows are reported best score first.
        ordered = temp_df.sort_values('combined_score', ascending=False)
        return ordered.loc[ordered.term_name.isin(terms_removed)]

    def remove_redundant(self, threshold=0.75, verbose=False, level='sample',
                         sort_by='combined_score', inplace=False):
        """
        Calculate similarity between all term sets and removes redundant
        terms.

        Parameters
        ----------
        threshold : float, default 0.75
        verbose : bool, default False
            Print similarity scores and removed terms.
        level : {'sample', 'dataframe'}, default 'sample'
            Level to filter dataframe. 'sample' filters each group of
            'sample_id' individually; a term kept in any sample is kept in
            all of them. 'dataframe' will merge all genes that share the
            same 'term_name'.
        sort_by : {'combined_score', 'rank', 'adj_p_value', 'n_genes'},
            default 'combined_score'
            Keyword to sort the dataframe. The scoring starts at the top
            term and compares to all the lower terms. 'rank' and
            'adj_p_value' sort ascending, anything else descending.
        inplace : bool
            Filter the dataframe in place or return filtered copy

        Returns
        -------
        EnrichmentResult or None
            The sorted, filtered copy, or None when `inplace` is True.

        """
        ascending = sort_by in ('rank', 'adj_p_value')
        # Sort a copy: the caller's frame must stay untouched unless
        # ``inplace`` was requested.
        data_copy = self.sort_values(sort_by, ascending=ascending)

        if 'sample_id' not in data_copy.columns or level == 'dataframe':
            to_keep = data_copy.unique_terms(threshold, verbose, level=level)
        else:
            to_keep = set()
            for sample in sorted(data_copy['sample_id'].unique()):
                subset = data_copy[data_copy['sample_id'] == sample]
                to_keep.update(
                    subset.unique_terms(threshold, verbose, level=level)
                )

        data_copy = data_copy[data_copy['term_name'].isin(to_keep)]
        print("Number of rows went from {} to {}".format(
            len(self.term_name.unique()),
            len(data_copy.term_name.unique()))
        )
        if inplace:
            self._update_inplace(data_copy)
        else:
            return data_copy

    def unique_terms(self, threshold=0.75, verbose=False, level='dataframe'):
        """
        Greedily pick terms that are not redundant with an earlier term.

        Terms are visited in row order. A visited term that has not been
        marked redundant is kept, and every later term scoring above
        `threshold` against it is marked redundant.

        Parameters
        ----------
        threshold : float
        verbose : bool
            Print each removed term together with its score.
        level : str, {'dataframe', 'each'}
            'dataframe' merges the genes of rows sharing a term_name; any
            other value treats every row individually.

        Returns
        -------
        set
            Names of the terms that are kept.

        """
        if level == 'dataframe':
            names = self['term_name'].unique()
            scores = self._get_distance_all()
        else:
            names = self['term_name'].values
            scores = self._get_distance_each()

        to_remove, to_keep = set(), set()
        n_dim = len(names)
        # ``scores`` is flat, in itertools.combinations order: the pairs
        # (i, i + 1) .. (i, n_dim - 1) for row i start at offset ``ind``.
        ind = 0
        for i, term_1 in enumerate(names):
            n_pairs = n_dim - i - 1
            if term_1 in to_remove:
                # Skip this row's scores in one step.
                ind += n_pairs
                continue
            to_keep.add(term_1)
            if verbose:
                print("Finding matches for {}".format(term_1))
            row_scores = scores[ind:ind + n_pairs]
            for term_2, score in zip(names[i + 1:], row_scores):
                if score > threshold:
                    to_remove.add(term_2)
                    if verbose:
                        print("\tScore for {} is {:.3f}".format(term_2,
                                                                 score))
                        print("\t\tRemoving {}".format(term_2))
            ind += n_pairs

        return to_keep

    def dist_matrix(self, figsize=(8, 8), level='dataframe'):
        """
        Create a distance matrix of all term similarity

        Parameters
        ----------
        figsize : tuple
            Size of figure
        level : str, {'dataframe', 'each'}
            How to treats term_name to genes. Dataframe compresses all genes
            from all sample_ids into same term. 'each' treats each term_name
            individually.

        Returns
        -------
        matplotlib.Figure
            Whatever ``cluster_distance_mat`` returns (defined outside this
            file).

        """
        mat, names = self.calc_dist(level)
        return cluster_distance_mat(mat, names, figsize)

    def calc_dist(self, level='dataframe'):
        """
        Build the symmetric matrix of pairwise term similarities.

        Parameters
        ----------
        level : str, {'dataframe', 'each'}
            'each' scores every row individually; any other value merges
            the genes of rows sharing a term_name.

        Returns
        -------
        mat : numpy.ndarray
            Square float matrix with ones on the diagonal.
        names : numpy.ndarray
            Term names, in the order of the rows and columns of `mat`.

        """
        if level == 'each':
            names = self['term_name'].values
            scores = self._get_distance_each()
        else:
            names = self['term_name'].unique()
            scores = self._get_distance_all()

        n_dim = len(names)
        mat = np.ones((n_dim, n_dim), dtype=float)
        # The strict upper triangle in row-major order matches the order
        # in which itertools.combinations yielded the pairs.
        rows, cols = np.triu_indices(n_dim, k=1)
        mat[rows, cols] = scores
        mat[cols, rows] = scores
        return mat, names

    def _get_distance_each(self):
        """Score every pair of rows, in itertools.combinations order."""
        vals = [set(genes.split(',')) for genes in self['genes'].values]
        return [_score(pair) for pair in combinations(vals, 2)]

    def _get_distance_all(self):
        """Score every pair of unique terms, in combinations order."""
        vals = [self.term_to_genes(name)
                for name in self['term_name'].unique()]
        return [_score(pair) for pair in combinations(vals, 2)]
def _score(vals):
    """Return the Jaccard index of the first two items of `vals`."""
    return jaccard_index(vals[0], vals[1])


def jaccard_index(set1, set2, remove_subset=True):
    """
    Computes the similarity between two sets.

    https://en.wikipedia.org/wiki/Jaccard_index

    Parameters
    ----------
    set1 : set
    set2 : set
        Other iterables of hashable items are accepted and converted to
        sets first.
    remove_subset : bool
        If a set is a subset of the other, return 1.

    Returns
    -------
    index : float
        Size of the intersection divided by the size of the union. Two
        empty sets score 1.

    References
    ----------
    .. [1] `Wikipedia entry for the Jaccard index
           <https://en.wikipedia.org/wiki/Jaccard_index>`_
    """
    # Lists would either lack ``union`` or count duplicates in ``len``.
    if not isinstance(set1, (set, frozenset)):
        set1 = set(set1)
    if not isinstance(set2, (set, frozenset)):
        set2 = set(set2)

    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: identical by convention, and this avoids a
        # division by zero below.
        return 1.
    # The union is as large as the bigger set only when the smaller set is
    # fully contained in it.
    if remove_subset and union == max(len(set1), len(set2)):
        return 1.
    return float(len(set1.intersection(set2))) / float(union)