Source code for magine.plotting.species_plotting

import os
import re
import time
from textwrap import wrap

import matplotlib.pyplot as plt
import numpy as np
import pathos.multiprocessing as mp
import plotly.graph_objs as plotly_graph
import seaborn as sns
from plotly.offline import plot, iplot, init_notebook_mode

import magine.html_templates.html_tools as ht
from magine.data.tools import log2_normalize_df

# Column names used to index the experimental-data DataFrames handled in
# this module (see the `cols` selection in `plot_dataframe`).
fold_change = 'fold_change'  # plotted on the y axis after log2 normalisation
flag = 'significant'         # used as a mask to mark points on the plots
exp_method = 'source'
p_val = 'p_value'

# NOTE(review): the next five names are not referenced in the visible part of
# this module; presumably species-type labels/columns -- confirm before removal.
rna = 'rna_seq'
gene = 'gene'
protein = 'protein'
metabolites = 'metabolites'
species_type = 'species_type'

sample_id = 'sample_id'    # values of this column become the x axis
identifier = 'identifier'  # species are selected and grouped by this column
label_col = 'label'        # one line is drawn per value of this column

# 'jet' colormap; not referenced by the functions visible in this module.
cm = plt.get_cmap('jet')


def write_table_to_html(data, save_name='index', out_dir=None,
                        run_parallel=False, exp_data=None,
                        plot_type='matplotlib'):
    """
    Creates a html table of plots of genes for each ontology term.

    One figure is requested per unique ``term_name`` (see
    `plot_genes_by_ont`), each term name in the table is replaced by the
    HTML link to its figure, and the resulting table is handed to
    ``ht.write_filter_table`` under the name ``save_name + '_filter'``.

    The table passed in is not modified; all relabelling happens on a copy.

    Parameters
    ----------
    data : magine.enrichment.enrichment_result.EnrichmentResult
        Enrichment table; must provide the columns 'term_name' and 'genes'.
    save_name : str
        name of html output file
    out_dir : str, optional
        output path for all plots
    run_parallel : bool
        Create plots in parallel
    exp_data : magine.data.ExperimentalData
        Experimental data the plots are drawn from.
    plot_type : str
        {'matplotlib', 'plotly'}

    """
    # Work on a copy: the term names are overwritten with HTML anchors below
    # and the caller's table must keep its plain term names.
    data = data.copy()

    list_of_terms = list(data['term_name'].unique())

    fig_dict, to_remove = plot_genes_by_ont(
        data=data,
        list_of_terms=list_of_terms,
        save_name=save_name,
        out_dir=out_dir,
        exp_data=exp_data,
        run_parallel=run_parallel,
        plot_type=plot_type
    )

    # Swap every term name for the link that points at its figure.
    for term, link in fig_dict.items():
        data.loc[data['term_name'] == term, 'term_name'] = link

    data = data[~data['term_name'].isin(to_remove)]

    html_out = save_name + '_filter'
    ht.write_filter_table(data, html_out)
def plot_genes_by_ont(data, list_of_terms, save_name, out_dir=None,
                      exp_data=None, run_parallel=False, plot_type='plotly'):
    """
    Creates a figure for each ontology term in ``list_of_terms``.

    For every term the genes listed in the 'genes' column of ``data`` are
    collected and one plot of those species is queued. Figures are written
    below a 'Figures' folder (inside ``out_dir`` when given, otherwise in the
    current working directory); the folder is created when missing.

    Terms that have no rows in ``data``, and (for matplotlib output only)
    terms with more than 100 genes, get a plain ``<a>`` label without a link
    and no figure.

    Parameters
    ----------
    data : pandas.DataFrame
        previously ran enrichment analysis; needs 'term_name' and 'genes'
        columns. 'genes' entries are either lists or comma separated strings.
    list_of_terms : list_like
        Terms (values of 'term_name') to create figures for.
    save_name : str
        name to save file
    out_dir : str
        output path for file
    exp_data : magine.ExperimentalData
        data to plot. Required whenever ``list_of_terms`` is not empty (its
        ``.data`` attribute is read).
    run_parallel : bool
        To run in parallel using pathos.multiprocessing
    plot_type : str
        plotly or matplotlib

    Returns
    -------
    figure_locations : dict
        Maps each term to the HTML anchor that should replace it in a table.
    to_remove : set
        Terms to drop from the table (currently always empty).

    Raises
    ------
    AssertionError
        If ``plot_type`` is neither 'plotly' nor 'matplotlib'.

    """
    if plot_type not in {'plotly', 'matplotlib'}:
        raise AssertionError("Please pass plotly or matplotlib as plot_type")

    # Every figure is saved as 'Figures/<n>_<save_name>', so that folder has
    # to exist even when no out_dir was given. makedirs also copes with a
    # nested out_dir that does not exist yet.
    if out_dir is None:
        figure_dir = 'Figures'
    else:
        figure_dir = os.path.join(out_dir, 'Figures')
    if not os.path.isdir(figure_dir):
        os.makedirs(figure_dir)

    data = data.copy()
    figure_locations = {}
    plots_to_create = []
    to_remove = set()

    # filter data by significance and number of references
    if len(list_of_terms) == 0:
        print("No significant ontology terms!!!")
        return figure_locations, to_remove

    # here we are going to iterate through all sig GO terms and create
    # a list of plots to create. For the HTML side, we need to point to
    # a location
    _data = exp_data.data.copy()

    # matplotlib figures are written as pdf (see _make_plots), plotly as html;
    # the link has to carry the matching extension.
    suffix = 'html' if plot_type == 'plotly' else 'pdf'

    # create plot of genes over time
    for n, i in enumerate(list_of_terms):
        # want to plot all species over time
        index = data['term_name'] == i
        name = data[index]['term_name'].unique()

        if len(name) == 0:
            # term is absent from the table: nothing to plot, keep a label
            # without a link instead of pointing at a file never written
            figure_locations[i] = '<a>{0}</a>'.format(i)
            continue
        name = name[0]

        gene_set = set()
        for g in data[index]['genes']:
            each = g if isinstance(g, list) else g.split(',')
            gene_set.update(each)

        if plot_type == 'matplotlib':
            # too many genes isn't helpful on plots, so skip them
            if len(gene_set) > 100:
                figure_locations[i] = '<a>{0}</a>'.format(name)
                continue

        local_save_name = os.path.join('Figures',
                                       '{0}_{1}'.format(n, save_name))
        local_save_name = local_save_name.replace(':', '')

        figure_locations[i] = '<a href="{0}.{1}">{2}</a>'.format(
            local_save_name, suffix, name
        )

        title = "{0} : {1}".format(str(i), name)
        local_df = _data.loc[_data[identifier].isin(list(gene_set))].copy()
        plots_to_create.append(
            [local_df, list(gene_set), local_save_name, out_dir, title,
             plot_type]
        )

    print("Starting to create plots for each term")
    _make_plots(plots_to_create, plot_species, run_parallel)

    return figure_locations, to_remove
def plot_dataframe(exp_data, html_filename, out_dir='proteins',
                   plot_type='plotly', run_parallel=False):
    """
    Creates one plot per species and an html table linking to the plots.

    Every unique value of the identifier column gets its own figure inside
    ``out_dir``. A copy of the data, with each identifier replaced by a link
    to its figure, is then written with ``ht.write_filter_table``.

    Parameters
    ----------
    exp_data : magine.BaseData.
        DataFrame-like table of measurements. Not modified.
    html_filename : str
        Name passed to ``ht.write_filter_table`` for the table output.
    out_dir: str, path
        Directory that will contain all proteins. Created (including missing
        parent directories) when it does not exist.
    plot_type : str
        plotly or matplotlib output
    run_parallel : bool
        create plots in parallel

    Returns
    -------

    """
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    local_data = exp_data.copy()

    # plotly figures are saved as html, matplotlib figures as pdf
    suffix = 'html' if plot_type == 'plotly' else 'pdf'

    fig_loc = {}
    plots = []
    for i in local_data[identifier].unique():
        # strip characters that are awkward in file names
        save_name = re.sub('[/_.]', '', i)
        # The full table is passed on purpose: plot_species derives the x
        # axis from all samples before it filters down to the one species.
        plots.append([local_data, [i], save_name, out_dir, i, plot_type])
        fig_loc[i] = '<a href="{0}/{1}.{2}">{1}</a>'.format(
            out_dir, save_name, suffix
        )

    _make_plots(plots, plot_species, run_parallel)

    # Place a link to the species for each key. Every identifier is a key of
    # fig_loc, so one vectorised lookup replaces a per-species .loc update.
    local_data[identifier] = local_data[identifier].map(fig_loc)

    cols = [identifier, label_col, fold_change, p_val, sample_id, exp_method,
            flag]
    local_data = local_data[cols]
    ht.write_filter_table(local_data, html_filename)
def _make_plots(plots_to_make, plot_func, parallel=False):
    """
    Call ``plot_func`` once for every argument list in ``plots_to_make``.

    Each argument list is extended with ``'pdf'`` and ``True`` before the
    call (the ``image_format`` and ``close_plots`` arguments of
    `plot_species`). The lists passed in are left untouched.

    Parameters
    ----------
    plots_to_make : list of list
        Positional arguments for each call of ``plot_func``.
    plot_func : callable
        Function that creates a single plot.
    parallel : bool
        Distribute the calls over a pathos multiprocessing pool.

    """
    # Build new argument lists rather than appending to the caller's, so a
    # list handed in twice is not extended twice.
    jobs = [list(args) + ['pdf', True] for args in plots_to_make]

    if parallel:
        st2 = time.time()
        pool = mp.Pool()
        try:
            # lambda a: function(a[0], **a[1]), arguments
            result = pool.map_async(lambda a: plot_func(*a), jobs)
        finally:
            pool.close()
            pool.join()
        end2 = time.time()
        # The pool is best effort: a failing plot does not raise here, but
        # it should not go unnoticed either.
        if not result.successful():
            print("Warning: at least one plot failed in a worker process")
        print("parallel time = {}".format(end2 - st2))
        print("Done creating plots for each GO term")
    else:
        st1 = time.time()
        for args in jobs:
            plot_func(*args)
        end1 = time.time()
        print("sequential time = {}".format(end1 - st1))

    # release any matplotlib figures still held by this process
    plt.close('all')
def plot_species(df, species_list=None, save_name='test', out_dir=None,
                 title=None, plot_type='plotly', image_format='pdf',
                 close_plots=False):
    """
    Plot log2 fold change against sample for the species in a dataframe.

    One line is drawn per label, grouped by identifier; measurements marked
    in the 'significant' column are overlaid with a separate marker.

    Parameters
    ----------
    df: pandas.DataFrame
        magine formatted dataframe
    species_list: list
        List of genes to be plotted. If None, all species in ``df`` are used.
    save_name: str
        Filename to be saved as
    out_dir: str
        Path for output to be saved. Created when it does not exist.
    title: str
        Title of plot, useful when list of genes corresponds to a GO term.
        Only passed on to the plotly figure.
    plot_type : str
        Use plotly to generate html output or matplotlib to generate pdf
    image_format : str
        pdf or png, only used if plot_type="matplotlib"
    close_plots : bool
        Close plot after making, use when creating lots of plots in
        parallel.

    Returns
    -------
    matplotlib.figure.Figure or None
        The figure for ``plot_type='matplotlib'`` when ``close_plots`` is
        False; None in every other case (including an empty ``df``).

    """
    ldf = df.copy()

    if out_dir is not None and not os.path.isdir(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            # a parallel worker may have created the directory in between
            if not os.path.isdir(out_dir):
                raise

    # gather x axis points (taken before filtering so that every species
    # plotted from the same table shares the same axis)
    x_points = sorted(ldf[sample_id].unique())
    if len(x_points) == 0:
        return

    # `np.float` no longer exists in current NumPy; builtin float (which
    # np.float64 subclasses) plus np.floating covers the same cases.
    if isinstance(x_points[0], (float, np.floating)):
        # numeric samples keep their own value as x position
        x_point_dict = {i: i for i in x_points}
    else:
        # anything else is spread evenly by rank
        x_point_dict = {i: n for n, i in enumerate(x_points)}

    if species_list is not None:
        ldf = ldf.loc[ldf[identifier].isin(species_list)].copy()
    ldf = log2_normalize_df(ldf, column=fold_change)

    n_plots = len(ldf[identifier].unique())
    num_colors = len(ldf[label_col].unique())
    color_list = sns.color_palette("tab20", num_colors)

    if plot_type == 'matplotlib':
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_prop_cycle(plt.cycler('color', color_list))

    plotly = []
    names_list = []
    total_counter = 0
    color_counter = 0

    for name, j in ldf.groupby(identifier):
        index_counter = 0
        for n, m in j.groupby(label_col):
            x = np.array(m[sample_id])
            if len(x) < 1:
                continue
            y = np.array(m['fold_change'])
            sig_flag = np.array(m[flag])

            index = np.argsort(x)
            x = x[index]
            y = y[index]
            s_flag = sig_flag[index]
            # only overlay markers when something is actually flagged
            has_flagged = bool(s_flag.any())

            # x values with scaled values (only changes things if non-float
            # values are used for sample_id
            x_index = np.array([x_point_dict[ind] for ind in x])

            index_counter += 1
            total_counter += 1

            # create matplotlib plot
            if plot_type == 'matplotlib':
                label = "\n".join(wrap(n, 40))
                p = ax.plot(x_index, y, '.-', label=label)
                if has_flagged:
                    color = p[0].get_color()
                    ax.plot(x_index[s_flag], y[s_flag], '^', color=color)

            # create plotly plot
            elif plot_type == 'plotly':
                # wrap around instead of running out of colours when the
                # same label shows up under more than one identifier
                c = color_list[color_counter % num_colors]
                color_counter += 1
                plotly.append(_ploty_graph(x_index, y, n, n, c))
                if has_flagged:
                    # the marker trace counts towards the traces toggled
                    # together with this identifier
                    index_counter += 1
                    total_counter += 1
                    plotly.append(_ploty_graph(x_index[s_flag], y[s_flag],
                                               n, n, c,
                                               marker='x-open-dot'))
        names_list.append([name, index_counter])

    if plot_type == 'matplotlib':
        lgd = _format_mpl(ax, x_point_dict, x_points)
        if save_name is not None:
            tmp_savename = "{}.{}".format(save_name, image_format)
            if out_dir is not None:
                tmp_savename = os.path.join(out_dir, tmp_savename)
            # save this figure explicitly rather than pyplot's current one
            fig.savefig(tmp_savename, bbox_extra_artists=(lgd,),
                        bbox_inches='tight')
        if close_plots:
            plt.close(fig)
        else:
            return fig

    elif plot_type == 'plotly':
        fig = _create_plotly(total_counter, n_plots, names_list,
                             x_point_dict, title, x_points, plotly)
        if save_name:
            _save_ploty_output(fig, out_dir, save_name)
        else:
            init_notebook_mode(connected=True)
            iplot(fig)
def _format_mpl(ax, x_point_dict, x_points):
    """
    Apply axis limits, tick labels, threshold lines and legend to ``ax``.

    Parameters
    ----------
    ax : matplotlib axes
    x_point_dict : dict
        Maps each sample to its x position.
    x_points : list
        Samples, in the order used for the tick labels.

    Returns
    -------
    The legend created on ``ax``.

    """
    positions = sorted(x_point_dict.values())

    # leave a margin of two units on either side of the data
    ax.set_xlim(positions[0] - 2, positions[-1] + 2)
    ax.set_xticks(positions)
    ax.set_xticklabels(x_points, rotation=90)
    plt.ylabel('log$_2$ Fold Change')

    # dashed guides at a 1.5 fold change up and down
    threshold = np.log2(1.5)
    plt.axhline(y=threshold, linestyle='--')
    plt.axhline(y=-threshold, linestyle='--')

    handles, labels = ax.get_legend_handles_labels()
    return ax.legend(handles, labels, loc='best', ncol=3,
                     bbox_to_anchor=(1.01, 1.0))


def _create_plotly(total_counter, n_plots, names_list, x_point_dict, title,
                   x_points, plotly_list):
    """
    Assemble the plotly figure with a dropdown to show one species at a time.

    Parameters
    ----------
    total_counter : int
        Total number of traces in ``plotly_list``.
    n_plots : int
        Number of entries of ``names_list`` to create buttons for.
    names_list : list
        ``[name, number_of_traces]`` pairs, in the order of the traces.
    x_point_dict : dict
        Maps each sample to its x position.
    title : str
    x_points : list
        Tick labels of the x axis.
    plotly_list : list
        The traces to show.

    Returns
    -------
    plotly.graph_objs.Figure

    """
    buttons = [dict(args=['visible', [True] * total_counter],
                    label='All',
                    method='restyle')]

    # making all false except group defined by protein name
    start = 0
    for entry in names_list[:n_plots] if n_plots <= len(names_list) \
            else (names_list[k] for k in range(n_plots)):
        group_name, group_size = entry[0], entry[1]
        stop = start + group_size
        visible = [start <= k < stop for k in range(total_counter)]
        start = stop
        buttons.append(dict(args=['visible', visible],
                            label=group_name,
                            method='restyle'))

    menu = [dict(x=-0.05, y=1, yanchor='top', buttons=buttons)]

    ticks = np.sort(list(x_point_dict.values()))
    x_axis = dict(
        title='Sample index',
        range=[np.min(ticks), np.max(ticks)],
        showticklabels=True,
        ticktext=x_points,
        tickmode='array',
        tickvals=ticks,
    )

    layout = plotly_graph.Layout(
        title=title,
        showlegend=True,
        xaxis=x_axis,
        yaxis=dict(title='log2fc'),
        hovermode="closest",
        updatemenus=menu
    )
    return plotly_graph.Figure(data=plotly_list, layout=layout)


def _save_ploty_output(fig, out_dir, save_name):
    """
    Render ``fig`` as a div and write it to ``<save_name>.html``.

    The file is placed inside ``out_dir`` when one is given; the rendered
    div is passed to ``ht.format_ploty`` together with the target path.
    """
    target = "{}.html".format(save_name)
    if out_dir is not None:
        target = os.path.join(out_dir, target)

    div = plot(fig, filename=target, auto_open=False,
               include_plotlyjs=False, output_type='div')
    ht.format_ploty(div, target)


def _ploty_graph(x, y, label, enum, color, marker='circle'):
    """
    Creates a single scatter plot

    With the default 'circle' marker the trace is a line with markers that is
    listed in the legend; any other marker gives a larger, marker-only trace
    that is hidden from the legend.

    Parameters
    ----------
    x : list_like
    y : list_like
    label : str
        Name of the trace.
    enum : int
        Used to build the legend group ('group_<enum>').
    color : sequence
        The first three entries are used as the r, g, b components.
    marker : str
        plotly marker symbol

    Returns
    -------
    plotly.graph_objs.Scatter

    """
    is_line = marker == 'circle'
    mode, show, size = (
        ('lines+markers', True, 8) if is_line else ('markers', False, 12)
    )

    l_color = 'rgba({},{},{},1.)'.format(color[0], color[1], color[2])

    return plotly_graph.Scatter(
        x=x,
        y=y,
        hoveron='points',
        name=label,
        visible=True,
        mode=mode,
        legendgroup='group_{}'.format(enum),
        showlegend=show,
        line=dict(color=l_color),
        marker=dict(symbol=marker, size=size, color=l_color),
    )