import sys
import os
import gc
import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import skbio
import seaborn as sns
from cobra import Configuration
from scipy.spatial.distance import squareform, pdist
from pathlib import Path
from multiprocessing import Pool
from functools import partial
from .modeling import build
from .diet import add_diet_to_model
from .io import load_cobra_model, write_lp_problem, write_cobra_model, suppress_stdout
from .utils import load_dataframe, remove_reverse_vars
from .coupling import add_coupling_constraints
from .metrics import _compute_diversity_metrics
from .logger import logger
# Shared COBRApy configuration used by every model built in this module:
# all loaded taxon/community models get symmetric default flux bounds of
# -1000/1000 (the mgpipe convention). The solver is set per-run in build_models.
cobra_config = Configuration()
cobra_config.lower_bound = -1000
cobra_config.upper_bound = 1000
def build_models(
    coverage_file,
    taxa_dir,
    solver="gurobi",
    threads=int(os.cpu_count() / 2),
    samples=None,
    parallel=True,
    lp_type=".mps",
    cobra_type=".xml",
    out_dir="./",
    coupling_constraints=True,
    diet_fecal_compartments=False,
    remove_reverse_vars_from_lp=False,
    hard_remove=False,
    abundance_threshold=1e-6,
    diet=None,
    vaginal=False,
    essential_metabolites=None,
    micronutrients=None,
    force_uptake=True,
    diet_threshold=0.8,
    compress=True,
    compute_metrics=True,
    force=False,
    sample_prefix='mc'
):
    """Build community COBRA models using mgpipe-like compartments and constraints.

    This is pymgpipe's main model building function; it can build models for
    one or many samples, either serially or in parallel.

    Args:
        coverage_file (pandas.DataFrame | str): Abundance matrix with taxa as rows and samples as columns
        taxa_dir (str): Directory containing individual strain/species taxa models (file names corresponding to index of coverage matrix)
        solver (str): LP solver (gurobi or cplex) used to solve models, defaults to gurobi
        threads (int): Number of threads to use if building in parallel (-1 means all cores but one)
        samples (str | list): Subset of sample labels to build; defaults to all columns of the coverage matrix
        parallel (bool): Samples will be built in parallel if set to True
        lp_type (str): File type for LP problem (either .mps or .lp), defaults to .mps
        cobra_type (str): File type for COBRA model (.xml, .mat, .json), defaults to .xml
        out_dir (str): Directory to save output of this function (models, LP problems, etc.), defaults to cwd
        coupling_constraints (bool): Add mgpipe's abundance coupling constraints to internal reactions, defaults to True
        diet_fecal_compartments (bool): Build models with mgpipe's diet/fecal compartmentalization, defaults to False
        remove_reverse_vars_from_lp (bool): Remove reverse variables that are added into COBRApy models
        hard_remove (bool): If set to True, variables will be removed entirely from models, which can be a time consuming process. Having it set to False will `remove` them by setting their bounds to 0.
        abundance_threshold (float): Taxa below this relative abundance are excluded from a sample's community
        diet (str): Name of diet or file you want to impose on models (can be one diet or personalized)
        vaginal (bool): Impose vaginal diet constraints to support vaginal diet
        essential_metabolites (list): List of metabolites to include as essential within the diet
        micronutrients (list): List of metabolites to include as micronutrients within diet
        force_uptake (bool): Force minimum uptake of nutrients from diet
        diet_threshold (float): Threshold on imposed diet uptake, i.e. lb < metabolite < (threshold * lb)
        compress (bool): Models and LP problems will be saved as compressed files if set to True, defaults to True
        compute_metrics (bool): Compute diversity metrics for built models, defaults to True
        force (bool): Rebuild models/LP problems even when output files already exist
        sample_prefix (str): Prefix used when anonymizing sample labels (see sample_label_conversion.csv)

    Notes:
        COBRA models written to *out_dir/models/*\n
        LP problems written to *out_dir/problems/*\n
        All modifications done at the level of the LP (i.e. coupling constraints, diet) are not saved when writing COBRA models to file. For this reason, it is always recommended to work with LP models in the `problems/` folder.
    """
    start = time.time()
    cobra_config.solver = solver
    # Avoid GC churn while many large models are constructed; workers
    # collect explicitly at the end of _inner.
    gc.disable()

    # Output layout: out_dir/models/ for COBRA models, out_dir/problems/ for LPs.
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    taxa_dir = taxa_dir if taxa_dir.endswith("/") else taxa_dir + "/"
    out_dir = out_dir if out_dir.endswith("/") else out_dir + "/"
    model_dir = out_dir + "models/"
    problem_dir = out_dir + "problems/"
    Path(model_dir).mkdir(exist_ok=True)
    Path(problem_dir).mkdir(exist_ok=True)

    formatted = _format_coverage_file(coverage_file, out_dir, sample_prefix)
    samples_to_run = list(formatted.columns)
    taxa = list(formatted.index)
    print(
        "Found coverage file with %s samples and %s unique taxa"
        % (len(samples_to_run), len(taxa))
    )

    if samples is not None:
        samples_to_run = samples if isinstance(samples, list) else [samples]
    if len(samples_to_run) <= 1:
        parallel = False

    threads = os.cpu_count() - 1 if threads == -1 else threads
    threads = min(threads, len(samples_to_run))

    print("Building %s models..." % len(samples_to_run))
    print("\n----------------Parameters----------------")
    print("Diet/fecal compartments- %s" % str(diet_fecal_compartments).upper())
    print("Coupling constraints- %s" % str(coupling_constraints).upper())
    print("Parallel- %s" % str(parallel).upper())
    print("Threads- %s" % str(threads).upper())
    print("Solver- %s" % solver.upper())
    print("LP type- %s" % lp_type.split(".")[1].upper())
    print("COBRA type- %s" % cobra_type.split(".")[1].upper())
    print("compress- %s" % str(compress).upper())
    print("Output directory- %s" % str(out_dir).upper())

    # Bind every parameter except the sample label so _inner can be mapped
    # over samples_to_run, both by Pool.imap and by the serial loop below.
    _func = partial(
        _inner,
        formatted,
        taxa_dir,
        solver,
        model_dir,
        problem_dir,
        lp_type,
        cobra_type,
        coupling_constraints,
        diet_fecal_compartments,
        remove_reverse_vars_from_lp,
        hard_remove,
        diet,
        vaginal,
        essential_metabolites,
        micronutrients,
        force_uptake,
        diet_threshold,
        abundance_threshold,
        compress,
        compute_metrics,
        force
    )

    if parallel:
        p = Pool(threads, initializer=_mute)
        p.daemon = False
        metrics = list(
            tqdm.tqdm(p.imap(_func, samples_to_run), total=len(samples_to_run))
        )
        p.close()
        p.join()
    else:
        # BUGFIX: previously `list(map(...))` was fully materialized *before*
        # being handed to tqdm, so the progress bar only appeared after all
        # work had already finished. Iterate through tqdm instead.
        metrics = [_func(s) for s in tqdm.tqdm(samples_to_run)]

    if compute_metrics:
        try:
            # _inner returns None when metrics could not be computed for a
            # sample; drop those entries before aggregating.
            valid_metrics = [m for m in metrics if m]
            abundance_df = pd.DataFrame(
                {m['sample']: m['reaction_abundance'] for m in valid_metrics}
            )
            abundance_df.to_csv(out_dir + 'reaction_abundance.csv')

            # Binarize abundances to get per-sample reaction presence/absence.
            abundance_df[abundance_df > 0] = 1
            abundance_df.to_csv(out_dir + 'reaction_content.csv')

            unique_reactions = [
                (len(m['taxa']), len(m['unique_reactions'])) for m in valid_metrics
            ]
            plt.scatter(*zip(*unique_reactions))
            plt.xlabel('# Taxa', fontsize=12)
            plt.ylabel('# Unique Reactions', fontsize=12)
            plt.xticks(np.arange(0, max(r[0] for r in unique_reactions) + 1, 1))
            plt.title('Metabolic Diversity')
            plt.savefig(out_dir + 'metabolic_diversity.png')
            plt.clf()

            # PCoA plot of reaction presence (Bray-Curtis distance between samples)
            abundance_df = abundance_df.fillna(0).T
            distance_matrix = squareform(pdist(abundance_df, 'braycurtis'))
            my_pcoa = skbio.stats.ordination.pcoa(distance_matrix)
            pcoa_points = my_pcoa.samples
            pcoa_points.set_index(abundance_df.index, inplace=True)

            ax = sns.scatterplot(
                data=pcoa_points, x='PC1', y='PC2', hue=pcoa_points.index, ax=plt.gca()
            )
            # BUGFIX: positional indexing on a labeled Series
            # (proportion_explained[0]) is deprecated in pandas; use .iloc.
            str1 = 'PC1 ' + str(round(my_pcoa.proportion_explained.iloc[0] * 100, 1)) + '% of explained variance'
            str2 = 'PC2 ' + str(round(my_pcoa.proportion_explained.iloc[1] * 100, 1)) + '% of explained variance'
            ax.set(xlabel=str1, ylabel=str2, title='PCoA of reaction presence (braycurtis)')
            plt.legend([], [], frameon=False)

            for line in range(pcoa_points.shape[0]):
                ax.text(
                    pcoa_points['PC1'].iloc[line] + 0.001,
                    pcoa_points['PC2'].iloc[line] + 0.001,
                    pcoa_points.index[line],
                    horizontalalignment='left',
                    size='small', color='black', weight='light'
                )
            plt.savefig(out_dir + 'reaction_pcoa.png')
        except Exception as e:
            # Metrics/plots are best-effort; never let them break the build.
            logger.warning('Ran into problem while saving metrics...skipping this step!\n%s' % e)

    print("-------------------------------------------------------")
    logger.info("Finished building %s models and associated LP problems!" % len(metrics))
    logger.info('Process took %s minutes to run...' % round((time.time() - start) / 60, 3))
def _inner(
    coverage_df,
    taxa_dir,
    solver,
    model_dir,
    problem_dir,
    lp_type,
    cobra_type,
    coupling_constraints,
    diet_fecal_compartments,
    remove_reverse_vars_from_lp,
    hard_remove,
    diet,
    vaginal,
    essential_metabolites,
    micronutrients,
    force_uptake,
    diet_threshold,
    abundance_threshold,
    compress,
    compute_metrics,
    force,
    sample_label,
):
    """Build, constrain, and write out the community model for one sample.

    Intended to be invoked through functools.partial with everything except
    `sample_label` pre-bound (see build_models), so it can be mapped over
    sample labels serially or via multiprocessing.Pool.

    Returns:
        dict | None: Diversity metrics for this sample when compute_metrics
        is True and computation succeeds, otherwise None.
    """
    # Compressed COBRA models are always written as .xml.gz regardless of
    # cobra_type -- NOTE(review): presumably compression is only supported
    # for SBML output; confirm before combining other formats with compress.
    model_out = (
        model_dir + "%s.%s" % (sample_label, cobra_type.split(".")[1])
        if not compress
        else model_dir + sample_label + ".xml.gz"
    )
    lp_out = problem_dir + "%s.%s" % (sample_label, lp_type.split(".")[1])
    pymgpipe_model = None
    metrics = None
    # Reuse a previously written COBRA model unless a rebuild is forced.
    if not force and os.path.exists(model_out):
        pymgpipe_model = load_cobra_model(model_out)
    else:
        with suppress_stdout():
            pymgpipe_model = build(
                sample=sample_label,
                abundances=coverage_df,
                taxa_directory=taxa_dir,
                threshold=abundance_threshold,
                diet_fecal_compartments=diet_fecal_compartments,
                solver=solver
            )
        # A freshly built model invalidates any LP already on disk, so force
        # the LP to be (re)written below.
        force = True
        write_cobra_model(pymgpipe_model, model_out)
    if compute_metrics:
        metrics = _compute_diversity_metrics(pymgpipe_model)
        if metrics is None or len(metrics)==0:
            logger.warning('Unable to compute diversity metrics for %s'%pymgpipe_model.name)
    # Only (re)write the LP when forced, or when no LP file (plain or
    # compressed variant) already exists for this sample.
    if force or (not os.path.exists(lp_out) and not os.path.exists(lp_out+'.gz') and not os.path.exists(lp_out+'.7z')):
        # ----- START OPTLANG MODIFICATIONS -----
        # These modifications (reverse-variable removal, diet, coupling) live
        # only in the LP problem, not in the saved COBRA model.
        if remove_reverse_vars_from_lp:
            try:
                logger.info('Removing variables from %s...'%sample_label)
                remove_reverse_vars(pymgpipe_model,hard_remove)
            except Exception:
                # Best-effort: a failed removal leaves a valid (larger) LP.
                logger.warning('Failed to remove reverse variables!')
        if diet is not None:
            add_diet_to_model(pymgpipe_model, diet, force_uptake, essential_metabolites, micronutrients, vaginal, diet_threshold)
        if coupling_constraints:
            try:
                logger.info('Adding coupling constraints to %s...'%sample_label)
                add_coupling_constraints(pymgpipe_model)
            except Exception:
                logger.warning("Failed to add coupling constraints!")
        write_lp_problem(pymgpipe_model, out_file=lp_out, compress=compress, force=True)
    else:
        logger.info('Skipping %s because LP problem already exists!'%sample_label)
    # Drop the (large) model promptly; gc is disabled by build_models, so
    # collect explicitly to bound memory across many samples.
    del pymgpipe_model
    gc.collect()
    return metrics
def _format_coverage_file(coverage_file, out_dir="./", sample_prefix=None):
    """Load the coverage matrix and optionally anonymize its sample labels.

    When `sample_prefix` is given, sample columns are renamed to
    `<prefix>1, <prefix>2, ...` (in sorted column order) and the mapping is
    persisted to `<out_dir>/sample_label_conversion.csv`. An existing,
    complete conversion file is reused; a missing, unreadable, or incomplete
    one is regenerated and overwritten.

    Args:
        coverage_file (pandas.DataFrame | str): Abundance matrix, or a path to one
        out_dir (str): Directory holding the label conversion file
        sample_prefix (str | None): Prefix for anonymized labels; None disables renaming

    Returns:
        pandas.DataFrame: Coverage matrix with (possibly renamed) sample columns.
    """
    coverage = load_dataframe(coverage_file)
    if sample_prefix is None:
        return coverage

    conversion_file_path = out_dir + "sample_label_conversion.csv"
    try:
        sample_conversion_dict = (
            pd.read_csv(conversion_file_path, index_col=0).iloc[:, 0].to_dict()
        )
        # An incomplete mapping is treated the same as an unreadable file:
        # the handler below regenerates and overwrites it.
        if set(sample_conversion_dict) != set(coverage.columns):
            raise ValueError(
                "Provided label conversion file %s does not provide labels for all samples!"
                % conversion_file_path
            )
    # BUGFIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit; narrowed to Exception.
    except Exception:
        sample_conversion_dict = {
            v: sample_prefix + str(i + 1)
            for i, v in enumerate(sorted(coverage.columns))
        }
        pd.DataFrame({"conversion": sample_conversion_dict}).to_csv(conversion_file_path)
    return coverage.rename(columns=sample_conversion_dict)
def _mute():
sys.stdout = open(os.devnull, "w")