Disease model

This repository provides four machine learning models used for binary classification.

Training models

Common imports (used by all models)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

1.Random forest classifier

from sklearn.ensemble import RandomForestClassifier

# ==============================
# Load data
# ==============================
# Labelled table: the 'Label' column is the target, every other column is a feature.
train_df = pd.read_csv('./Pos_Neg.csv')
# Unlabelled table that is scored once a final model has been fitted.
unknown_df = pd.read_csv('./all_ATS.csv')

y_train = train_df['Label']
X_train = train_df.drop('Label', axis=1)
# Independent copy so the raw unknown table is left as loaded.
X_unknown = unknown_df.copy()

# ==============================
# Build model pipeline
# ==============================
# Scaling and the classifier live in one Pipeline so both are fitted together.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', rf_classifier),
])

# ==============================
# 10-fold cross-validation (probability output)
# ==============================
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One out-of-fold probability row per training sample.
cv_proba = cross_val_predict(
    pipeline, X_train, y_train, cv=cv, method='predict_proba'
)
# Keep the second probability column (index 1).
y_pred_proba_cv = cv_proba[:, 1]

# ==============================
# ROC and PR metrics
# ==============================
# Both curves are computed from the out-of-fold probabilities.
fpr, tpr, _ = roc_curve(y_train, y_pred_proba_cv)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_train, y_pred_proba_cv)
ap_score = average_precision_score(y_train, y_pred_proba_cv)

# ==============================
# Visualization (Random Forest only)
# ==============================
# Two panels side by side: ROC on the left, precision-recall on the right.
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(8, 4))

roc_ax.plot(fpr, tpr, lw=2, label=f'AUC = {roc_auc:.2f}')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
roc_ax.plot([0, 1], [0, 1], linestyle='--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()

pr_ax.plot(recall, precision, lw=2, label=f'AP = {ap_score:.2f}')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()

fig.tight_layout()
plt.show()

# ==============================
# Save CV predictions
# ==============================
# Store the out-of-fold probabilities next to the original training rows.
cv_predictions_path = '04.10Fold_predictions_rf.csv'
train_df['predictions'] = y_pred_proba_cv
train_df.to_csv(cv_predictions_path, index=False)

# ==============================
# Train final model and feature importance
# ==============================
# Fit the pipeline on the full training set; this fitted pipeline is the one
# used for feature importance and for scoring the unknown samples.
pipeline.fit(X_train, y_train)

# Importances reported by the fitted forest, ordered from highest to lowest.
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

sorted_features = X_train.columns[indices]
sorted_importances = importances[indices]

# One row per feature, indexed by feature name.
pd.DataFrame(
    sorted_importances,
    index=sorted_features,
    columns=['Importance']
).to_csv('RF.importance.csv')

plt.figure()
plt.barh(sorted_features, sorted_importances)
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)')
# Put the most important feature at the top of the horizontal bar chart.
plt.gca().invert_yaxis()
# Make room for the feature-name tick labels before exporting; without this,
# long names can be clipped at the left edge of the saved PDF.
plt.tight_layout()
plt.savefig('04.rf_importance.pdf', dpi=300)
plt.show()

# ==============================
# Predict unknown samples
# ==============================
# Score only the training feature columns (in training order) and attach the
# result to a copy. Writing 'Probability' into X_unknown itself would add a
# non-feature column and break every later predict_proba(X_unknown) call.
unknown_proba = pipeline.predict_proba(X_unknown[X_train.columns])[:, 1]
X_unknown.assign(Probability=unknown_proba).to_csv(
    '04.unknown_predictions_rf.csv', index=False
)

2.Logistic regression

from sklearn.linear_model import LogisticRegression

# Logistic regression behind the same scaling step as the other models.
lr_classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', lr_classifier),
])

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold ROC AUC and average precision.
aucs, aps = [], []

for fit_rows, eval_rows in cv.split(X_train, y_train):
    X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_eval, y_eval = X_train.iloc[eval_rows], y_train.iloc[eval_rows]

    # Refit on this fold's training rows, then score its held-out rows.
    pipeline.fit(X_fit, y_fit)
    y_prob = pipeline.predict_proba(X_eval)[:, 1]

    fpr, tpr, _ = roc_curve(y_eval, y_prob)
    aucs.append(auc(fpr, tpr))
    aps.append(average_precision_score(y_eval, y_prob))

# Mean and standard deviation across the folds.
print(f"Average AUC: {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")
print(f"Average AP:  {np.mean(aps):.3f} ± {np.std(aps):.3f}")

# Final model: refit on the full training set after the per-fold fits above.
pipeline.fit(X_train, y_train)

# Coefficient-based feature importance
# coef_[0] holds one coefficient per feature; rank by absolute value
# (largest first) but store the signed coefficient.
coef = pipeline.named_steps['classifier'].coef_[0]
indices = np.argsort(np.abs(coef))[::-1]

pd.DataFrame(
    coef[indices],
    index=X_train.columns[indices],
    columns=['Coefficient']
).to_csv('04.lr.importance.csv')

# Score only the training feature columns (in training order) and attach the
# result to a copy. Writing 'Probability' into X_unknown itself would add a
# non-feature column and break every later predict_proba(X_unknown) call;
# selecting X_train.columns also ignores such a column if one is already there.
unknown_proba = pipeline.predict_proba(X_unknown[X_train.columns])[:, 1]
X_unknown.assign(Probability=unknown_proba).to_csv(
    '04.unknown_predictions_lr.csv', index=False
)

3.Support vector machine (SVM)

from sklearn.svm import SVC
from sklearn.inspection import permutation_importance

# probability=True is required for SVC to expose predict_proba.
svm_classifier = SVC(probability=True, random_state=42)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', svm_classifier),
])

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold ROC AUC and average precision.
aucs, aps = [], []

for fit_rows, eval_rows in cv.split(X_train, y_train):
    X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_eval, y_eval = X_train.iloc[eval_rows], y_train.iloc[eval_rows]

    # Refit on this fold's training rows, then score its held-out rows.
    pipeline.fit(X_fit, y_fit)
    y_prob = pipeline.predict_proba(X_eval)[:, 1]

    fpr, tpr, _ = roc_curve(y_eval, y_prob)
    aucs.append(auc(fpr, tpr))
    aps.append(average_precision_score(y_eval, y_prob))

# Mean and standard deviation across the folds.
print(f"Average AUC: {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")
print(f"Average AP:  {np.mean(aps):.3f} ± {np.std(aps):.3f}")

# Final model: refit on the full training set after the per-fold fits above.
pipeline.fit(X_train, y_train)

# Permutation importance
# Computed on the training data itself (X_train, y_train), with 10 shuffles
# per feature.
result = permutation_importance(
    pipeline,
    X_train,
    y_train,
    n_repeats=10,
    random_state=42
)

# One value per feature is written, without feature names.
# NOTE(review): rows presumably follow the column order of X_train — confirm
# before joining these files back to feature names.
np.savetxt('04.svm.importance.mean.txt', result.importances_mean)
np.savetxt('04.svm.importance.std.txt', result.importances_std)

# Score only the training feature columns (in training order) and attach the
# result to a copy. Writing 'Probability' into X_unknown itself would add a
# non-feature column and break every later predict_proba(X_unknown) call;
# selecting X_train.columns also ignores such a column if one is already there.
unknown_proba = pipeline.predict_proba(X_unknown[X_train.columns])[:, 1]
X_unknown.assign(Probability=unknown_proba).to_csv(
    '04.unknown_predictions_svm.csv', index=False
)

4.Neural network (MLP)

from sklearn.neural_network import MLPClassifier

# Single-hidden-layer MLP behind the same scaling step as the other models.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=1000,
        random_state=42
    ))
])

# Use the same 10-fold stratified split (same seed) as the random forest,
# logistic regression and SVM sections so the reported AUC/AP values are
# computed on identical folds and can be compared across models.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold ROC AUC and average precision.
aucs, aps = [], []

for train_idx, test_idx in cv.split(X_train, y_train):
    # Refit on this fold's training rows, then score its held-out rows.
    pipeline.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = pipeline.predict_proba(X_train.iloc[test_idx])[:, 1]

    fpr, tpr, _ = roc_curve(y_train.iloc[test_idx], y_prob)
    aucs.append(auc(fpr, tpr))
    aps.append(average_precision_score(y_train.iloc[test_idx], y_prob))

# Mean and standard deviation across the folds.
print(f"Average AUC: {np.mean(aucs):.3f} ± {np.std(aucs):.3f}")
print(f"Average AP:  {np.mean(aps):.3f} ± {np.std(aps):.3f}")

# Final model: refit on the full training set after the per-fold fits above.
pipeline.fit(X_train, y_train)

# Score only the training feature columns (in training order) and attach the
# result to a copy. Writing 'Probability' into X_unknown itself would add a
# non-feature column; selecting X_train.columns also ignores such a column if
# an earlier section already added one.
unknown_proba = pipeline.predict_proba(X_unknown[X_train.columns])[:, 1]
X_unknown.assign(Probability=unknown_proba).to_csv(
    '04.unknown_predictions_nn.csv', index=False
)

Using pretrained models for prediction

This document explains how to use a trained sklearn .pkl model (saved as a full Pipeline) to predict probabilities for new, unseen data.

The workflow is designed to be reproducible, safe, and GitHub-ready.

Saving a trained model

Best practice: always save the entire Pipeline (including preprocessing and the classifier).

This ensures that the same data transformations used during training are automatically applied during prediction.

import joblib

# Fit on the full training dataset; fit() returns the fitted pipeline itself.
# NOTE: every model section rebinds the name `pipeline`, so make sure it
# refers to the model that matches the file name used below.
fitted_pipeline = pipeline.fit(X_train, y_train)

# Persist scaler and classifier together in a single file.
joblib.dump(fitted_pipeline, 'rf_pipeline.pkl')

Why save the full pipeline?

  • Prevents data leakage: the scaler fitted on the training data is reused as-is and is never re-fitted on new samples

  • Avoids manual feature scaling during prediction

  • Guarantees consistency between training and inference

Predicting new data

import pandas as pd
import joblib

# ==============================
# Load trained pipeline
# ==============================
# The file holds the whole fitted pipeline (scaler and classifier).
pipeline = joblib.load('rf_pipeline.pkl')

# ==============================
# Load new (unlabeled) data
# ==============================
# The feature columns must match the training data
new_data = pd.read_csv('./new_samples.csv')

# ==============================
# Predict probabilities
# ==============================
# predict_proba returns one column per class; column index 1 is kept as the
# probability of the positive class (Label = 1)
class_probabilities = pipeline.predict_proba(new_data)
probabilities = class_probabilities[:, 1]

# ==============================
# Save prediction results
# ==============================
# The probability is appended as an extra column next to the input features.
new_data['Probability'] = probabilities
new_data.to_csv('rf_predictions_new_data.csv', index=False)

This script:

  • Does not retrain the model

  • Automatically applies preprocessing

  • Can be reused for any saved pipeline (RF / LR / SVM / NN)

Notes and important considerations

Feature consistency

  • Feature names, order, and number must be identical to the training data

  • Missing or additional columns will cause errors or invalid predictions

Recommended safety check:

# Run this before predict_proba(). It compares against the feature names the
# pipeline recorded when it was fitted, so it also works in a prediction
# script where X_train is not loaded. Raising (instead of assert) keeps the
# check active under `python -O`.
# Requires a pipeline fitted on a DataFrame with scikit-learn >= 1.0.
expected_columns = list(pipeline.feature_names_in_)
if list(new_data.columns) != expected_columns:
    raise ValueError(
        f"Feature columns do not match the training data: "
        f"expected {expected_columns}, got {list(new_data.columns)}"
    )

Do not re-scale new data manually

  • The scaler is already stored inside the pipeline

  • Do not apply StandardScaler.fit_transform() again

Incorrect:

# Wrong: this fits a brand-new scaler on the new data, so the features are
# standardized with different statistics than the scaler stored in the pipeline.
scaler = StandardScaler()
new_data = scaler.fit_transform(new_data)

Correct:

# Pass the raw feature table; the pipeline's own 'scaler' step transforms it.
pipeline.predict_proba(new_data)

predict() vs predict_proba()

  • predict() returns class labels (0 or 1)

  • predict_proba() returns probabilities

For ranking, scoring, or biological risk analysis, predict_proba() is strongly recommended.

Model compatibility

The same prediction script works for all saved models:

# Alternatives, not a sequence: keep only the line for the model you need.
# Each assignment rebinds `pipeline`, so running all three leaves only the
# last one loaded.
pipeline = joblib.load('lr_pipeline.pkl')
pipeline = joblib.load('svm_pipeline.pkl')
pipeline = joblib.load('nn_pipeline.pkl')

This is a key advantage of using sklearn Pipelines.