Source code for magine.data.experimental_data

import os
import subprocess
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import table

from magine.data.base import BaseData
from magine.data.tools import log2_normalize_df
from magine.plotting import volcano_plots as v_plot
from magine.plotting.species_plotting import plot_dataframe, plot_species

# pandas.set_option('display.max_colwidth', -1)
# Column definitions: names of the DataFrame columns, and a few of the
# values stored in those columns, that the code in this module refers to.

# Column holding the fold change value of a measurement.
fold_change = 'fold_change'
# Column used as a boolean mask to select significant measurements.
flag = 'significant'
# Column naming the experimental method ("source") of a measurement.
exp_method = 'source'
# Column holding the p-value of a measurement.
p_val = 'p_value'
# Value of the `exp_method` column that marks RNA level data.
rna = 'rna_seq'
# Values of the `species_type` column.
gene = 'gene'
protein = 'protein'
metabolites = 'metabolites'
# Column holding the species type (compared against the values above).
species_type = 'species_type'
# Column identifying the sample a measurement belongs to.
sample_id = 'sample_id'
# Column holding the species identifier.
identifier = 'identifier'
# Column holding the species label.
label = 'label'
# Columns ExperimentalData.__init__ looks for; a missing one is reported
# with a printed message.
valid_cols = [fold_change, flag, p_val, species_type, sample_id]


def load_data_csv(file_name, **kwargs):
    """ Deprecated alias of :func:`load_data`.

    Emits a ``DeprecationWarning`` attributed to the caller and then
    forwards every argument to :func:`load_data`.

    Parameters
    ----------
    file_name : str
        Passed through to :func:`load_data`.
    kwargs :
        Passed through to :func:`load_data`.

    Returns
    -------
    Whatever :func:`load_data` returns.

    """
    message = ("load_data_csv will be removed in a future "
               "version of MAGINE. Use load_data instead.")
    # stacklevel=2 points the warning at the code calling this function.
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return load_data(file_name, **kwargs)


def load_data(file_name, **kwargs):
    """ Load a csv file into an ExperimentalData instance

    Rows whose ``fold_change`` value is null are dropped before the data
    is handed to :class:`ExperimentalData`.

    Parameters
    ----------
    file_name : str
        File to read with ``pandas.read_csv``.
    kwargs :
        Flags to pass to ``pandas.read_csv``.

    Returns
    -------
    ExperimentalData

    """
    raw = pd.read_csv(file_name, **kwargs)
    has_fold_change = raw[fold_change].notnull()
    return ExperimentalData(raw[has_fold_change])


class Sample(BaseData):
    """ Provides tools for subsets of data types

    Adds filtering helpers (``up``, ``down``, ``subset``, ...) and plotting
    helpers (volcano plots, histogram, per-species plots) on top of
    ``BaseData``. All helpers address the data through the module-level
    column names (``identifier``, ``fold_change``, ``significant``,
    ``sample_id``, ``source``, ``label``, ``p_value``).

    NOTE(review): ``self.sig`` is used below but not defined in this class;
    it is presumably provided by ``BaseData`` -- confirm.
    """

    def __init__(self, *args, **kwargs):
        super(Sample, self).__init__(*args, **kwargs)
        # self.drop_duplicates(inplace=True)
        self._index = identifier
        self._identifier = identifier
        self._value_name = fold_change
        self._sample_id_name = sample_id
        self._label = label
        self._up = None
        self._down = None
        self._sig = None

    @property
    def _constructor(self):
        # Presumably the pandas subclassing hook, so that derived frames are
        # Sample instances -- assumes BaseData derives from pandas.DataFrame.
        return Sample

    @property
    def exp_methods(self):
        """ Sorted list of experimental methods ('source') in data"""
        return sorted(set(self[exp_method].values))

    @property
    def sample_ids(self):
        """ Sorted list of sample_ids in data"""
        return sorted(set(self[sample_id].values))

    @property
    def up(self):
        """Rows flagged significant with a positive fold change"""
        return self.loc[self[flag] & (self[fold_change] > 0)]

    @property
    def down(self):
        """Rows flagged significant with a negative fold change"""
        return self.loc[self[flag] & (self[fold_change] < 0)]

    @property
    def id_list(self):
        """ Set of species identifiers """
        return set(self[self._identifier].values)

    @property
    def label_list(self):
        """ Set of species labels """
        return set(self[self._label].values)

    @property
    def up_by_sample(self):
        """List of sets of up regulated species ids, one per sample_id"""
        return [self.loc[self[sample_id] == i].up.id_list
                for i in self.sample_ids]

    @property
    def down_by_sample(self):
        """List of sets of down regulated species ids, one per sample_id"""
        return [self.loc[self[sample_id] == i].down.id_list
                for i in self.sample_ids]

    @property
    def by_sample(self):
        """List of sets of species ids, one per sample_id

        No filtering on the significant flag is applied here; filter the
        data first if only significant species are wanted.
        """
        return [self.loc[self[sample_id] == i].id_list
                for i in self.sample_ids]

    def subset(self, species=None, index='identifier', sample_ids=None,
               exp_methods=None):
        """ Filter the data by species, sample_id and/or source

        Parameters
        ----------
        species : list, tuple, set, str
            Species to keep. A str is matched with ``str.contains`` on the
            `index` column, a collection with ``isin``.
        index : str
            Column to filter based on provided 'species'
        sample_ids : str, list
            A str is matched with ``str.contains`` on the sample_id column,
            anything else with ``isin``.
        exp_methods : str, list
            A str is matched with ``str.contains`` on the source column,
            anything else with ``isin``.

        Returns
        -------
        magine.data.experimental_data.Sample

        """
        df = self.copy()
        if isinstance(species, str):
            df = df.loc[df[index].str.contains(species)]
        elif isinstance(species, (list, tuple, set)):
            df = df.loc[df[index].isin(species)]

        # Each filter dispatches on the type of its OWN argument. The
        # previous code tested `species` here, which sent a str sample_ids
        # to isin() (and a list to str.contains) depending on `species`.
        if sample_ids is not None:
            if isinstance(sample_ids, str):
                df = df.loc[df[sample_id].str.contains(sample_ids)]
            else:
                df = df.loc[df[sample_id].isin(sample_ids)]

        if exp_methods is not None:
            if isinstance(exp_methods, str):
                df = df.loc[df[exp_method].str.contains(exp_methods)]
            else:
                df = df.loc[df[exp_method].isin(exp_methods)]
        return df

    def plot_pie_sig_ratio(self, save_name=None, ax=None, fig=None,
                           figsize=None):
        """ Pie chart of the number of measured and significant species

        Parameters
        ----------
        save_name : str
            If provided, the figure is saved as '<save_name>.png'
        ax : matplotlib.axes, optional
            Axes to draw on. Created if not provided.
        fig : matplotlib.figure, optional
            Figure to draw on. Created (or taken from `ax`) if not provided.
        figsize : tuple
            Size of figure, only used when a new figure is created.
            Defaults to (3, 3).

        Returns
        -------
        matplotlib.figure

        """
        # NOTE(review): the first wedge counts ALL ids (significant ones
        # included), not only the non-significant ones -- confirm intent.
        x = len(self.id_list)
        y = len(self.sig.id_list)
        total = x + y

        # Resolve figure and axes independently so that passing only one of
        # them neither crashes (fig only) nor returns None (ax only).
        if ax is None:
            if fig is None:
                if figsize is None:
                    figsize = (3, 3)
                fig = plt.figure(figsize=figsize)
            ax = fig.add_subplot(111)
        elif fig is None:
            fig = ax.get_figure()

        wedges, texts, autotexts = ax.pie(
            [x, y], explode=(0.05, 0.05), textprops={'fontsize': 16},
            # convert the percentage back into the underlying count
            autopct=lambda p: '{:.0f}'.format(p * total / 100),
            shadow=True, startangle=140
        )
        plt.setp(autotexts, size=20)
        # Act on the axes/figure in use rather than pyplot's "current" ones.
        ax.axis('equal')
        if save_name is not None:
            fig.savefig('{}.png'.format(save_name), dpi=300,
                        bbox_inches='tight')
        return fig

    def volcano_plot(self, save_name=None, out_dir=None, sig_column=False,
                     p_value=0.1, fold_change_cutoff=1.5, x_range=None,
                     y_range=None):
        """ Create a volcano plot of data

        Thin wrapper around ``magine.plotting.volcano_plots.volcano_plot``.

        Parameters
        ----------
        save_name: str
            name to save figure
        out_dir: str, directory
            Location to save figure
        sig_column: bool, optional
            If to use significant flags of data
        p_value: float, optional
            Criteria for significant
        fold_change_cutoff: float, optional
            Criteria for significant
        y_range: array_like
            upper and lower bounds of plot in y direction
        x_range: array_like
            upper and lower bounds of plot in x direction

        Returns
        -------
        matplotlib.Figure

        """
        fig = v_plot.volcano_plot(self, save_name=save_name, out_dir=out_dir,
                                  sig_column=sig_column, p_value=p_value,
                                  fold_change_cutoff=fold_change_cutoff,
                                  x_range=x_range, y_range=y_range)
        return fig

    def volcano_by_sample(self, save_name=None, p_value=0.1, out_dir=None,
                          fold_change_cutoff=1.5, y_range=None, x_range=None,
                          sig_column=False):
        """ Creates a figure with one volcano subplot per sample_id

        Parameters
        ----------
        save_name: str
            name to save figure
        out_dir: str, directory
            Location to save figure
        sig_column: bool, optional
            If to use significant flags of data
        p_value: float, optional
            Criteria for significant
        fold_change_cutoff: float, optional
            Criteria for significant
        y_range: array_like
            upper and lower bounds of plot in y direction
        x_range: array_like
            upper and lower bounds of plot in x direction

        Returns
        -------
        matplotlib.Figure

        """
        data = self.copy()
        n_sample = np.sort(data[sample_id].unique())

        # Grid layout: 2 columns, 3 once there are more than 8 samples.
        if len(n_sample) > 8:
            n_cols = 3
        else:
            n_cols = 2
        n_rows = int(np.rint(len(n_sample) / float(n_cols)))
        # Rounding can leave the grid one cell short; grow the shorter side.
        if n_cols * n_rows < len(n_sample):
            if n_cols >= n_rows:
                n_rows += 1
            else:
                n_cols += 1

        # figsize is (width, height): width scales with the number of
        # columns and height with the number of rows.
        fig = plt.figure(figsize=(4 * n_cols, 3 * n_rows))
        for n, i in enumerate(n_sample):
            sample = data[data[sample_id] == i].copy()
            sample = sample.dropna(subset=[p_val])
            # isfinite drops NaN as well as +/-inf fold changes
            sample = sample[np.isfinite(sample[fold_change])]

            sec_0, sec_1, sec_2 = v_plot.create_mask(sample, sig_column,
                                                     p_value,
                                                     fold_change_cutoff)
            ax = fig.add_subplot(n_rows, n_cols, n + 1)
            ax.set_title(i)
            v_plot.add_volcano_plot(ax, sec_0, sec_1, sec_2)

            if not sig_column:
                # Draw the cutoffs in the plot's transformed units.
                fc = np.log2(fold_change_cutoff)
                log_p_val = -1 * np.log10(p_value)
                ax.axvline(x=fc, linestyle='--')
                ax.axvline(x=-1 * fc, linestyle='--')
                ax.axhline(y=log_p_val, linestyle='--')
            if y_range is not None:
                ax.set_ylim(y_range[0], y_range[1])
            if x_range is not None:
                ax.set_xlim(x_range[0], x_range[1])

        fig.tight_layout()
        if save_name is not None:
            v_plot.save_plot(fig, save_name=save_name, out_dir=out_dir)
        return fig

    def plot_species(self, species_list=None, subset_index=None,
                     save_name=None, out_dir=None, title=None,
                     plot_type='plotly', image_format='png'):
        """ Create scatter plot of species list

        Parameters
        ----------
        species_list : list
            list of compounds
        subset_index : str
            Column to filter based on species_list. Defaults to the
            identifier column.
        save_name : str
            Name of html output file
        out_dir : str
            Location to place plots
        title : str
            Title for HTML page
        plot_type : str
            Type of plot outputs, can be "plotly" or "matplotlib"
        image_format : str
            pdf or png, only used if plot_type="matplotlib"

        Returns
        -------
        matplotlib.Figure or plotly.Figure

        """
        df = self.copy()
        if species_list is not None:
            if subset_index is None:
                subset_index = self._index
            df = df.subset(species_list, index=subset_index)

        return plot_species(
            df, save_name=save_name, out_dir=out_dir, title=title,
            plot_type=plot_type, image_format=image_format
        )

    def plot_all(self, html_file_name, out_dir='out', plot_type='plotly',
                 run_parallel=False):
        """ Creates a plot of every species in the data

        Thin wrapper around
        ``magine.plotting.species_plotting.plot_dataframe``.

        Parameters
        ----------
        html_file_name : str
            filename to save html of all plots
        out_dir: str, path
            Directory that will contain all plots
        plot_type : str
            plotly or matplotlib output
        run_parallel : bool
            Create the plots in parallel

        Returns
        -------
        None

        """
        plot_dataframe(self, html_filename=html_file_name, out_dir=out_dir,
                       plot_type=plot_type, run_parallel=run_parallel)

    def plot_histogram(self, save_name=None, y_range=None, out_dir=None):
        """ Plots a histogram of log2 fold change values

        Parameters
        ----------
        save_name: str
            Name of figure
        out_dir: str, path
            Path to location to save figure
        y_range: array_like
            Lower and upper bound of the plotted data range. Despite the
            name it limits the x axis, which carries the fold change values.

        Returns
        -------
        matplotlib.Figure

        """
        data = self.copy()
        data = data.dropna(subset=[p_val])
        # isfinite drops NaN as well as +/-inf fold changes
        data = data[np.isfinite(data[fold_change])]
        tmp = np.array(log2_normalize_df(data, fold_change)[fold_change])

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(tmp, 50, color='gray')
        if y_range is not None:
            ax.set_xlim(y_range[0], y_range[1])
        # Base 10 is matplotlib's default; the `basey` keyword used before
        # was removed in matplotlib 3.5.
        ax.set_yscale('log')
        ax.set_xlabel('log$_2$ Fold Change', fontsize=16)
        ax.set_ylabel('Count', fontsize=16)
        fig.tight_layout()
        if save_name is not None:
            v_plot.save_plot(fig, save_name, out_dir)
        return fig
class ExperimentalData(object):
    """ Manages all experimental data

    Wraps the full data table (``self.data``) and exposes it as ``Sample``
    views: by species type (``genes``, ``proteins``, ``rna``,
    ``compounds``, ``species``) and, as dynamically created attributes, one
    per 'source' value and one per 'sample_id' value. Those dynamic
    attributes can also be reached with item access (``obj['name']``).
    """

    def __init__(self, data_file):
        """

        Parameters
        ----------
        data_file : str, pandas.DataFrame
            Name of file, generally csv.
            If provided a str, the file will be read in as a pandas.DataFrame

        """
        if isinstance(data_file, pd.DataFrame):
            df = data_file.copy()
        else:
            df = pd.read_csv(data_file, parse_dates=False, low_memory=False)
        df.reset_index(drop=True, inplace=True)
        df.drop_duplicates(inplace=True)

        # Report (but do not fail on) expected columns that are missing.
        for i in valid_cols:
            if i not in df.columns:
                print("{} not in columns.".format(i))

        self.data = BaseData(df)
        self._index = 'identifier'
        # Lazily filled caches for the properties below.
        self.__proteins = None
        self.__genes = None
        self.__species = None
        self.__rna = None
        self.__compounds = None

        # One attribute per experimental method and per sample id, each
        # holding the matching rows as a Sample.
        for i in self.exp_methods:
            setattr(self, i, Sample(self.data.loc[self.data[exp_method] == i]))
        for i in self.sample_ids:
            setattr(self, i, Sample(self.data.loc[self.data[sample_id] == i]))

    def __getitem__(self, name):
        # Item access is an alias for attribute access, e.g. data['rna_seq']
        return super(ExperimentalData, self).__getattribute__(name)

    @property
    def genes(self):
        """ All data tagged with gene

        Rows whose species_type is 'protein' or 'gene'; this includes both
        protein and RNA measurements.

        Returns
        -------
        Sample

        """
        if self.__genes is None:
            tmp = self.data.copy()
            tmp = tmp.loc[tmp[species_type].isin([protein, gene])]
            self.__genes = Sample(tmp)
        return self.__genes

    @property
    def proteins(self):
        """ Protein level data

        Rows whose species_type is 'protein' or 'gene' and whose source is
        not 'rna_seq'.

        Returns
        -------
        Sample

        """
        if self.__proteins is None:
            tmp = self.data.copy()
            # Build both halves of the mask from the same frame.
            tmp = tmp.loc[(tmp[species_type].isin([protein, gene]))
                          & ~(tmp[exp_method] == rna)]
            self.__proteins = Sample(tmp)
        return self.__proteins

    @property
    def rna(self):
        """ RNA level data

        Rows whose source is 'rna_seq'.

        Returns
        -------
        Sample

        """
        if self.__rna is None:
            tmp = self.data.copy()
            tmp = tmp.loc[tmp[exp_method] == rna]
            self.__rna = Sample(tmp)
        return self.__rna

    @property
    def compounds(self):
        """ Only compounds in data

        Rows whose species_type is 'metabolites'.

        Returns
        -------
        Sample

        """
        if self.__compounds is None:
            tmp = self.data.copy()
            tmp = tmp.loc[tmp[species_type] == metabolites]
            self.__compounds = Sample(tmp)
        return self.__compounds

    @property
    def species(self):
        """ Returns data in Sample format

        Returns
        -------
        Sample

        """
        if self.__species is None:
            self.__species = Sample(self.data.copy())
        return self.__species

    @property
    def exp_methods(self):
        """ List of unique values of the source column """
        return list(self.data[exp_method].unique())

    @property
    def sample_ids(self):
        """ Sorted list of unique sample_ids """
        return sorted(self.data[sample_id].unique())

    def subset(self, species, index='identifier'):
        """ Filter all species by a name or a collection of names

        Parameters
        ----------
        species : list, str
            A str is matched with ``str.contains`` on the `index` column,
            anything else with ``isin``.
        index : str
            Column to filter based on provided 'species'

        Returns
        -------
        magine.data.experimental_data.Sample

        """
        df = self.species.copy()
        if isinstance(species, str):
            df = df.loc[df[index].str.contains(species)]
        else:
            df = df.loc[df[index].isin(species)]
        return df

    def get_measured_by_datatype(self):
        """ Returns sets of species ids per data type

        Returns
        -------
        measured, sig_measured : dict, dict
            Keys are 'source' values, values are sets of identifiers (all
            measured and significant only, respectively).

        """
        return get_measured_by_datatype(self)

    def create_summary_table(self, sig=False, index=identifier,
                             save_name=None, plot=False, write_latex=False):
        """ Creates a summary table of data.

        Parameters
        ----------
        sig: bool
            Flag to summarize significant species only
        save_name: str
            Name to save csv and .tex file
        index: str
           Index for counts
        plot: bool
            If you want to create a plot of the table
        write_latex: bool
            Create latex file of table

        Returns
        -------
        pandas.DataFrame

        """
        return create_table_of_data(self, sig=sig, index=index,
                                    save_name=save_name, plot=plot,
                                    write_latex=write_latex)

    def volcano_analysis(self, out_dir, use_sig_flag=True, p_value=0.1,
                         fold_change_cutoff=1.5):
        """ Creates a volcano plot for each experimental method

        Parameters
        ----------
        out_dir: str, path
            Path to where the output figures will be saved. Created,
            including missing parent directories, if it does not exist.
        use_sig_flag: bool
            Use significant flag of data
        p_value: float, optional
            p value criteria for significant
            Will not be used if use_sig_flag
        fold_change_cutoff: float, optional
            fold change criteria for significant
            Will not be used if use_sig_flag

        Returns
        -------
        None

        """
        # makedirs (rather than mkdir) so nested output paths work too.
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        for i in self.exp_methods:
            self[i].volcano_plot(
                i, out_dir=out_dir, sig_column=use_sig_flag,
                p_value=p_value, fold_change_cutoff=fold_change_cutoff
            )
# LaTeX document wrapped around the table by _write_to_latex. Filled in with
# str.format, hence the doubled braces around literal LaTeX braces. The
# line breaks matter: '%' starts a LaTeX comment that runs to end of line.
template = r'''
\documentclass[12pt, letterpaper]{{article}}
\usepackage{{booktabs}}
\usepackage{{geometry}}
\usepackage{{pdflscape}}
%\usepackage{{nopageno}}
\geometry{{ papersize={{4.444in,12.681in}},total={{2.8in,6.8in}} }}
\begin{{document}}
\begin{{landscape}}
\begin{{table}}
{}
\end{{table}}
\end{{landscape}}
\end{{document}}
'''


def get_measured_by_datatype(data):
    """ Get unique list of species for each 'source' label in data.

    Parameters
    ----------
    data : ExperimentalData

    Returns
    -------
    measured, sig_measured : dict, dict
        Dictionaries where keys are 'source' and values are sets of ids.

    """
    measured = dict()
    sig_measured = dict()
    for i in data.exp_methods:
        sig_measured[i] = set(data[i].sig.id_list)
        measured[i] = set(data[i].id_list)
    return measured, sig_measured


def create_table_of_data(data, sig=False, index='identifier', save_name=None,
                         plot=False, write_latex=False):
    """ Creates a summary table of data.

    Counts the unique, non-null `index` values for every combination of
    'source' (rows) and 'sample_id' (columns) and appends a
    'Total Unique Across' column with the unique count per 'source'.

    Parameters
    ----------
    data : ExperimentalData
    sig: bool
        Flag to summarize significant species only
    save_name: None, str
        Name to save csv and .tex file
    index: str
       Index to create counts
    plot: bool
        If you want to create a plot of the table
    write_latex: bool
        Create latex file of table. Only done if `save_name` is provided.

    Returns
    -------
    pandas.DataFrame

    """
    if sig:
        data_copy = data.species.sig.copy()
    else:
        data_copy = data.species.copy()

    count_table = data_copy.pivot_table(
        values=index, index=exp_method, columns=sample_id,
        fill_value=np.nan, aggfunc=lambda x: x.dropna().nunique()
    )

    # This just makes sure things are printed as ints, not floats.
    # Missing combinations are shown as '-'.
    for i in count_table.columns:
        count_table[i] = \
            count_table[i].fillna(-1).astype(int).replace(-1, '-')

    unique_col = {}
    for i in data.exp_methods:
        if sig:
            unique_col[i] = len(set(data[i].sig[index].values))
        else:
            unique_col[i] = len(set(data[i][index].values))
    count_table['Total Unique Across'] = pd.Series(unique_col,
                                                   index=count_table.index)

    if plot:
        ax = plt.subplot(111, frame_on=False)
        table(ax, count_table, loc='center')
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        plt.tight_layout()
        if save_name is not None:
            plt.savefig('{}.png'.format(save_name), dpi=300,
                        bbox_inches='tight')

    if save_name is not None:
        count_table.to_csv('{}.csv'.format(save_name))
    if write_latex and save_name is not None:
        _write_to_latex(pd_table=count_table, save_name=save_name)
    return count_table


def _write_to_latex(pd_table, save_name):
    """ Write `pd_table` to '<save_name>.tex' and try to compile it.

    If 'pdflatex' is found the .tex file is compiled; if 'pdfcrop' is also
    found the pdf is cropped in place, and if 'convert' is found as well the
    cropped pdf is converted to '<save_name>.png'. Output of the external
    tools is discarded.

    Parameters
    ----------
    pd_table : pandas.DataFrame
    save_name : str
        File name without extension

    """
    filename = '{0}.tex'.format(save_name)
    pdffile = '{0}.pdf'.format(save_name)

    with open(filename, 'wt') as f:
        st = pd_table.to_latex(
            column_format='*{{{}}}{{c}}'.format(str(pd_table.shape[1] + 2)))
        f.write(template.format(st))

    if not _which('pdflatex'):
        print('Install pdflatex to compile to pdf or png\n'
              'You can use the csv file for use in outside tools')
        return

    print('Compiling table')
    with open(os.devnull, "w") as fnull:
        subprocess.call(['pdflatex', filename],
                        stderr=subprocess.STDOUT, stdout=fnull)
        if _which('pdfcrop'):
            subprocess.call(['pdfcrop', pdffile, pdffile],
                            stderr=subprocess.STDOUT, stdout=fnull)
            if _which('convert'):
                tmp_png_name = '{0}.png'.format(save_name)
                subprocess.call(['convert', '-density', '300', pdffile,
                                 '-quality', '90', tmp_png_name],
                                stderr=subprocess.STDOUT, stdout=fnull)


def _which(program):
    """ Locate an executable, similar to the unix ``which`` command.

    Parameters
    ----------
    program : str
        Program name, or a path to a program. Both the name itself and the
        name with '.exe' appended are tried.

    Returns
    -------
    str or None
        Path of the first executable match; None if there is none.

    """

    def _is_exe(filepath):
        return os.path.isfile(filepath) and os.access(filepath, os.X_OK)

    candidates = (program, program + '.exe')

    fpath, _ = os.path.split(program)
    if fpath:
        # An explicit path was given: do not search PATH.
        for candidate in candidates:
            if _is_exe(candidate):
                return candidate
        return None

    # An unset or empty PATH means there is nothing to search.
    search_path = os.environ.get("PATH", "")
    directories = search_path.split(os.pathsep) if search_path else []
    for path in directories:
        path = path.strip('"')
        for candidate in candidates:
            exe_file = os.path.join(path, candidate)
            if _is_exe(exe_file):
                return exe_file
    return None