In this page, we offer an example analysis of the DEBIAS-M classifier. For this analysis, we use the collection of HIV studies that is available on Synapse. For a full set of analyses performed by DEBIAS-M on this dataset, please refer to the DEBIAS-M manuscript.
## import packages
import numpy as np
import pandas as pd
from debiasm import DebiasMClassifier
from debiasm.torch_functions import rescale
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import pairwise_distances
from sklearn.linear_model import LogisticRegression
from skbio.stats.ordination import pcoa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
def load_HIVRC(data_path):
df = pd.read_csv(os.path.join(data_path,
'insight.merged_otus.txt'),
sep='\t',
index_col=0
).iloc[:, :-1].T
## we generally recommend some minimum presence OTU filter
df = df.loc[:,((df>0).sum(axis=0) > df.shape[0]*.05 )]
md = pd.read_csv(os.path.join(data_path,
'metadata.tsv'),
sep='\t')
md = md.set_index('SeqID').loc[df.index]
md['label'] = md['hivstatus']==1
return((df, md))
df, md = load_HIVRC(data_path='.') ## this assumes the data is downloaded to the same directory, can change as needed
X_with_batch = pd.concat([pd.Series(pd.Categorical(md.Study).codes,
index=md.index,
name='batch'),
df
], axis=1)
# Compute a distance matrix (using Bray-Curtis distance)
distance_matrix = pairwise_distances(rescale(df.values),
metric='braycurtis')
# Perform PCoA analysis
ordination_result = pcoa(distance_matrix, number_of_dimensions=2)
# Extract the first two principal coordinates
x_coords = ordination_result.samples.iloc[:, 0]
y_coords = ordination_result.samples.iloc[:, 1]
plt.figure(figsize=(8,8))
sns.scatterplot(x=x_coords,
y=y_coords,
hue=md.Study.values,
s=100
)
# Add labels and title
plt.xlabel('PCoA 1')
plt.ylabel('PCoA 2')
plt.title('PCoA of the raw data')
plt.legend().remove()
plt.grid(True)
plt.show()
We demonstrate the following code using DEBIAS-M with default parameters, using one study as a test set.
test_sd_number=5 ## index of the test study
train_inds = md.Study!=md.Study.unique()[test_sd_number]
## run DEBIAS-M
dmc=DebiasMClassifier(x_val=X_with_batch.loc[~train_inds].values) ## provide the read counts for the test set
dmc.fit(X_with_batch.loc[train_inds].values, md.label.loc[train_inds].values) ## the read counts and labels for the training studies
## transform the data
x_dmc=dmc.transform(X_with_batch)
# Generate a PCoA plot
## Compute a distance matrix (using Bray-Curtis distance)
distance_matrix = pairwise_distances(x_dmc.values,
metric='braycurtis')
# Perform PCoA analysis
ordination_result = pcoa(distance_matrix, number_of_dimensions=2)
# Extract the first two principal coordinates
x_coords = ordination_result.samples.iloc[:, 0]
y_coords = ordination_result.samples.iloc[:, 1]
plt.figure(figsize=(8,8))
sns.scatterplot(x=x_coords,
y=y_coords,
hue=md.Study.values,
s=100
)
# Add labels and title
plt.xlabel('PCoA 1')
plt.ylabel('PCoA 2')
plt.title('PCoA plot of the DEBIAS-ed data')
plt.legend().remove()
plt.grid(True)
plt.show()
# Function to plot ROC curves for multiple prediction arrays
def plot_roc_curves(predictions_list,
y_true,
name_list):
plt.figure(figsize=(8,8))
for i, y_pred in enumerate(predictions_list):
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f'{name_list[i]} (AUROC = {roc_auc:.2f})', linewidth=5)
plt.plot([0, 1], [0, 1], 'k--', lw=5)
plt.xlabel('False Positive Rate')
plt.ylabel('T"rue Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(False)
plt.title('ROC curve of DEBIAS-M cross-study HIV predictions')
## run baseline linear model on raw data (equivalent to debias-m's model without the bias inference)
lr=LogisticRegression(penalty='none',
max_iter=int(1e3),
solver='newton-cg'
)
lr.fit(rescale(df.loc[train_inds]), md.label.loc[train_inds])
plot_roc_curves([lr.predict_proba(df.loc[~train_inds])[:, 1],
dmc.predict_proba(X_with_batch.loc[~train_inds].values)[:, 1] ],
md.label.values[~train_inds.values],
['Raw',
'DEBIAS-M'
])
plt.show()
See also:
Multitask DEBIAS-M
Classifier
The multitask DEBIAS-M classifier
DebiasMRegressor
Implementation of a DEBIAS-M regressor
For more
background on DEBIAS-M, refer to our
manuscript.