Source code for pymgpipe.main

import sys
import os
import gc
import tqdm
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import time
import skbio 
import seaborn as sns 
from cobra import Configuration
from scipy.spatial.distance import squareform, pdist
from pathlib import Path
from multiprocessing import Pool
from functools import partial
from .modeling import build
from .diet import add_diet_to_model
from .io import load_cobra_model, write_lp_problem, write_cobra_model, suppress_stdout
from .utils import load_dataframe, remove_reverse_vars
from .coupling import add_coupling_constraints
from .metrics import _compute_diversity_metrics
from .logger import logger

cobra_config = Configuration()
cobra_config.lower_bound = -1000
cobra_config.upper_bound = 1000


[docs] def build_models( coverage_file, taxa_dir, solver="gurobi", threads=int(os.cpu_count() / 2), samples=None, parallel=True, lp_type=".mps", cobra_type=".xml", out_dir="./", coupling_constraints=True, diet_fecal_compartments=False, remove_reverse_vars_from_lp=False, hard_remove=False, abundance_threshold=1e-6, diet=None, vaginal=False, essential_metabolites=None, micronutrients=None, force_uptake=True, diet_threshold=0.8, compress=True, compute_metrics=True, force=False, sample_prefix='mc' ): """Build community COBRA models using mgpipe-like compartments and constraints. This function is pymgpipe's main model building function, and can be used to build models for either one or multiple samples. Args: coverage_file (pandas.DataFrame | str): Abundance matrix with taxa as rows and samples as columns taxa_dir (str): Directory containing individual strain/species taxa models (file names corresponding to index of coverage matrix) out_dir (str): Directory to save output of this function (models, LP problems, etc.), defaults to cwd diet_fecal_compartments (bool): Build models with mgpipe's diet/fecal compartmentalization, defaults to False coupling_constraints (bool): Add mgpipe's abundance coupling constraints to internal reactions, defaults to True remove_reverse_vars_from_lp (bool): Remove reverse variables that are added into COBRApy models hard_remove (bool): If set to True, variables will be removed entirely from models, which can be a time consuming process. Having it set to False will `remove` them by setting their bounds to 0. diet (str): Name of diet or file you want to impose on models (can be one diet or personalized) vaginal (bool): Impose vaginal diet constraints to support vaginal diet essential_metabolites (list): List of metabolites to include as essential within the diet micronutrients (list): List of metabolites to include as micronutrients within diet force_uptake (bool): Force minimum uptake of nutrients from diet diet_threshold (float): Threshold on imposed diet uptake, i.e. lb < metabolite < (threshold * lb) solver (str): LP solver (gurobi or cplex) used to solve models, defaults to gurobi parallel (bool): Samples will be built in parallel if set to True threads (int): Number of threads to use if building in parallel lp_type (str): File type for LP problem (either .mps or .lp), defaults to .mps cobra_type (str): File type for COBRA model (.xml, .mat, .json), defaults to .xml compress (bool): Models and LP problems will be saved as compressed files if set to True, defaults to True compute_metrics (bool): Compute diversity metrics for built models, defaults to True Notes: COBRA models written to *out_dir/models/*\n LP problems written to *out_dir/problems/*\n All modifications done at the level of the LP (i.e. coupling constraints, diet) are not saved when writing COBRA models to file. For this reason, it is always recommended to work with LP models in the `problems/` folder. """ start = time.time() cobra_config.solver = solver gc.disable() Path(out_dir).mkdir(exist_ok=True) taxa_dir = taxa_dir + "/" if taxa_dir[-1] != "/" else taxa_dir out_dir = out_dir + "/" if out_dir[-1] != "/" else out_dir model_dir = out_dir + "models/" problem_dir = out_dir + "problems/" Path(model_dir).mkdir(exist_ok=True) Path(problem_dir).mkdir(exist_ok=True) formatted = _format_coverage_file(coverage_file, out_dir, sample_prefix) samples_to_run = list(formatted.columns) taxa = list(formatted.index) print( "Found coverage file with %s samples and %s unique taxa" % (len(samples_to_run), len(taxa)) ) if samples is not None: samples_to_run = samples if isinstance(samples, list) else [samples] if len(samples_to_run) <= 1: parallel = False threads = os.cpu_count() - 1 if threads == -1 else threads threads = min(threads, len(samples_to_run)) print("Building %s models..." % len(samples_to_run)) print("\n----------------Parameters----------------") print("Diet/fecal compartments- %s" % str(diet_fecal_compartments).upper()) print("Coupling constraints- %s" % str(coupling_constraints).upper()) print("Parallel- %s" % str(parallel).upper()) print("Threads- %s" % str(threads).upper()) print("Solver- %s" % solver.upper()) print("LP type- %s" % str(lp_type.split(".")[1]).upper()) print("COBRA type- %s" % str(cobra_type.split(".")[1]).upper()) print("compress- %s" % str(compress).upper()) print("Output directory- %s" % str(out_dir).upper()) _func = partial( _inner, formatted, taxa_dir, solver, model_dir, problem_dir, lp_type, cobra_type, coupling_constraints, diet_fecal_compartments, remove_reverse_vars_from_lp, hard_remove, diet, vaginal, essential_metabolites, micronutrients, force_uptake, diet_threshold, abundance_threshold, compress, compute_metrics, force ) if parallel: p = Pool(threads, initializer=_mute) p.daemon = False metrics = list( tqdm.tqdm(p.imap(_func, samples_to_run), total=len(samples_to_run)) ) p.close() p.join() else: metrics = tqdm.tqdm(list(map(_func, samples_to_run)), total=len(samples_to_run)) if compute_metrics: try: abundance_df = pd.DataFrame({m['sample']:m['reaction_abundance'] for m in metrics}) abundance_df.to_csv(out_dir+'reaction_abundance.csv') abundance_df[abundance_df > 0] = 1 abundance_df.to_csv(out_dir+'reaction_content.csv') unique_reactions = [(len(m['taxa']),len(m['unique_reactions'])) for m in metrics] plt.scatter(*zip(*unique_reactions)) plt.xlabel('# Taxa', fontsize=12) plt.ylabel('# Unique Reactions', fontsize=12) plt.xticks(np.arange(0,max([r[0] for r in unique_reactions])+1,1)) plt.title('Metabolic Diversity') plt.savefig(out_dir+'metabolic_diversity.png') plt.clf() # PCoA plot of reaction presence abundance_df = abundance_df.fillna(0).T distance_matrix = squareform(pdist(abundance_df, 'braycurtis')) my_pcoa = skbio.stats.ordination.pcoa(distance_matrix) pcoa_points = my_pcoa.samples pcoa_points.set_index(abundance_df.index,inplace=True) ax = sns.scatterplot(data=pcoa_points, x='PC1', y='PC2', hue=pcoa_points.index, ax=plt.gca()) str1 = 'PC1 ' + str(round(my_pcoa.proportion_explained[0], 3) * 100) + '% of explained variance' str2 = 'PC2 ' + str(round(my_pcoa.proportion_explained[1], 3) * 100) + '% of explained variance' ax.set(xlabel=str1, ylabel=str2, title='PCoA of reaction presence (braycurtis)') plt.legend([],[], frameon=False) for line in range(0,pcoa_points.shape[0]): ax.text(pcoa_points['PC1'][line]+0.001, pcoa_points['PC2'][line]+0.001, pcoa_points.index[line], horizontalalignment='left', size='small', color='black', weight='light') plt.savefig(out_dir+'reaction_pcoa.png') except Exception as e: logger.warn('Ran into problem while saving metrics...skipping this step!\n%s'%e) pass print("-------------------------------------------------------") logger.info("Finished building %s models and associated LP problems!" % len(metrics)) logger.info('Process took %s minutes to run...'%round((time.time()-start)/60,3))
def _inner( coverage_df, taxa_dir, solver, model_dir, problem_dir, lp_type, cobra_type, coupling_constraints, diet_fecal_compartments, remove_reverse_vars_from_lp, hard_remove, diet, vaginal, essential_metabolites, micronutrients, force_uptake, diet_threshold, abundance_threshold, compress, compute_metrics, force, sample_label, ): model_out = ( model_dir + "%s.%s" % (sample_label, cobra_type.split(".")[1]) if not compress else model_dir + sample_label + ".xml.gz" ) lp_out = problem_dir + "%s.%s" % (sample_label, lp_type.split(".")[1]) pymgpipe_model = None metrics = None if not force and os.path.exists(model_out): pymgpipe_model = load_cobra_model(model_out) else: with suppress_stdout(): pymgpipe_model = build( sample=sample_label, abundances=coverage_df, taxa_directory=taxa_dir, threshold=abundance_threshold, diet_fecal_compartments=diet_fecal_compartments, solver=solver ) force = True write_cobra_model(pymgpipe_model, model_out) if compute_metrics: metrics = _compute_diversity_metrics(pymgpipe_model) if metrics is None or len(metrics)==0: logger.warning('Unable to compute diversity metrics for %s'%pymgpipe_model.name) if force or (not os.path.exists(lp_out) and not os.path.exists(lp_out+'.gz') and not os.path.exists(lp_out+'.7z')): # ----- START OPTLANG MODIFICATIONS ----- if remove_reverse_vars_from_lp: try: logger.info('Removing variables from %s...'%sample_label) remove_reverse_vars(pymgpipe_model,hard_remove) except Exception: logger.warning('Failed to remove reverse variables!') if diet is not None: add_diet_to_model(pymgpipe_model, diet, force_uptake, essential_metabolites, micronutrients, vaginal, diet_threshold) if coupling_constraints: try: logger.info('Adding coupling constraints to %s...'%sample_label) add_coupling_constraints(pymgpipe_model) except Exception: logger.warning("Failed to add coupling constraints!") write_lp_problem(pymgpipe_model, out_file=lp_out, compress=compress, force=True) else: logger.info('Skipping %s because LP problem already exists!'%sample_label) del pymgpipe_model gc.collect() return metrics def _format_coverage_file(coverage_file, out_dir = './', sample_prefix = None): coverage = load_dataframe(coverage_file) if sample_prefix is None: return coverage conversion_file_path = out_dir + "sample_label_conversion.csv" try: sample_conversion_dict = pd.read_csv(conversion_file_path, index_col=0).iloc[:, 0].to_dict() if set(sample_conversion_dict.keys()) != set(coverage.columns): raise Exception( "Provided label conversion file %s does not provide labels for all samples!" % conversion_file_path ) except: sample_conversion_dict = { v: sample_prefix + str(i + 1) for i, v in enumerate(sorted(coverage.columns)) } pd.DataFrame({"conversion": sample_conversion_dict}).to_csv(conversion_file_path) return coverage.rename(columns=sample_conversion_dict) def _mute(): sys.stdout = open(os.devnull, "w")