| import os |
| import pandas as pd |
| import numpy as np |
| from sklearn.model_selection import KFold |
| from sklearn.metrics import mean_squared_error, r2_score |
| from scipy.stats import pearsonr, spearmanr |
| from catboost import CatBoostRegressor |
| import matplotlib.pyplot as plt |
|
|
| |
# Global matplotlib text styling, set one rc group at a time:
# serif font at 13 pt, 14 pt axis labels/titles, 12 pt ticks and legend.
plt.rc('font', family='serif', size=13)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=12)
|
|
| |
# Load the input table; the ligand and receptor feature columns are parsed
# below as comma-separated numeric strings.
data = pd.read_csv("/storage/group/cdm8/default/BindPred/embeddings/xxx.csv")


def _parse_feature_string(raw):
    """Turn a comma-separated string into a list of floats.

    Tokens that are empty or whitespace-only are skipped. Anything that is
    not a string yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    return [float(token) for token in raw.split(',') if token.strip()]


def _join_features(row):
    """Concatenate one row's ligand and receptor feature vectors.

    An empty array is returned when either vector is empty, which marks the
    row for removal by the filter that follows.
    """
    ligand = row['Ligand Features']
    receptor = row['Receptor Features']
    if len(ligand) > 0 and len(receptor) > 0:
        return np.concatenate((ligand, receptor))
    return np.array([])


# Missing cells become '' first, so they parse to an empty list.
for column in ('Ligand Features', 'Receptor Features'):
    data[column] = data[column].fillna('').apply(_parse_feature_string)

data['Combined Features'] = data.apply(_join_features, axis=1)

# Keep only the rows that ended up with a non-empty combined vector.
has_features = data['Combined Features'].apply(len) > 0
data = data[has_features]
|
|
| |
# Validate the affinity column before taking logarithms so that bad input
# fails here with a clear message instead of becoming NaN / -inf targets.
if "KD(M)" not in data.columns or data["KD(M)"].isnull().any():
    raise ValueError("Missing or NaN values in 'KD(M)' column.")

# np.vstack cannot stack an empty sequence; report the real cause instead.
if len(data) == 0:
    raise ValueError(
        "No rows with both ligand and receptor features are available."
    )

# log10 is undefined for zero and negative values: NumPy would only emit a
# RuntimeWarning and return -inf / NaN, silently corrupting the targets.
kd_values = data["KD(M)"]
if (kd_values <= 0).any():
    raise ValueError(
        "Non-positive values in 'KD(M)' column; log10 is undefined for them."
    )

# Feature matrix: one row per sample, built from the combined vectors.
X = np.vstack(data['Combined Features'].tolist())
# Regression target: log10 of KD(M), kept as a Series so it can be sliced
# positionally with .iloc.
y = np.log10(kd_values)
|
|
| |
# Directory that receives the figure files and the prediction table.
output_dir = "new_plt"
os.makedirs(output_dir, exist_ok=True)

# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Accumulators for held-out targets, predictions and row positions,
# filled fold by fold.
all_y_true, all_y_pred, test_indices_all = [], [], []
|
|
# Hyper-parameters shared by every fold's model.
catboost_settings = {
    "iterations": 2000,
    "learning_rate": 0.08,
    "depth": 4,
    "verbose": 500,
    "task_type": "GPU",
    "devices": '0',
}

# Fit a fresh model on each training split and collect its predictions for
# the held-out split, together with the held-out row positions.
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    model = CatBoostRegressor(**catboost_settings)
    model.fit(X[train_index], y.iloc[train_index])
    y_pred = model.predict(X[test_index])

    all_y_true.extend(y.iloc[test_index])
    all_y_pred.extend(y_pred)
    test_indices_all.extend(test_index)
|
|
| |
# Convert the accumulated lists to arrays for vectorised scoring.
all_y_true = np.array(all_y_true)
all_y_pred = np.array(all_y_pred)

# Pearson and Spearman coefficients; only the first returned value (the
# statistic) is kept.
pcc = pearsonr(all_y_true, all_y_pred)[0]
srcc = spearmanr(all_y_true, all_y_pred)[0]

# Coefficient of determination and root-mean-squared error.
r2 = r2_score(all_y_true, all_y_pred)
mse = mean_squared_error(all_y_true, all_y_pred)
rmse = np.sqrt(mse)

# Per-sample absolute error, used to colour the scatter points.
errors = np.abs(all_y_pred - all_y_true)
|
|
| |
# Parity plot: predicted vs. experimental log10(Kd), one triangle per
# sample, coloured by its absolute error.
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("ESM2 Embeddings", fontsize=15, pad=10)

marker_style = {
    "s": 25,
    "cmap": 'Reds',
    "alpha": 0.9,
    "edgecolors": 'black',
    "linewidth": 0.4,
    "marker": '^',
}
sc = ax.scatter(all_y_true, all_y_pred, c=errors, **marker_style)

# Dashed y = x reference line across the plotted range.
ax.plot([-15, -2], [-15, -2], color='black', linestyle='--', linewidth=1)

# Axis labels, fixed identical limits on both axes, and a square aspect.
ax.set_xlabel("Experimental Log10(Kd)", fontsize=14, labelpad=10)
ax.set_ylabel("BindPred Prediction of Log10(Kd)", fontsize=14, labelpad=10)
ax.set_xlim(-15.0, -2.0)
ax.set_ylim(-15.0, -2.0)
ax.set_aspect('equal', adjustable='box')

# Metric summary box anchored to the top-left corner in axes coordinates.
summary_box = dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.3')
ax.text(
    0.05,
    0.95,
    f"PCC: {pcc:.3f}\nRMSE: {rmse:.3f}\nR²: {r2:.3f}",
    transform=ax.transAxes,
    fontsize=12,
    verticalalignment='top',
    horizontalalignment='left',
    bbox=summary_box,
)

# Colour bar for the absolute-error scale.
cbar = fig.colorbar(sc, ax=ax)
cbar.set_label("Absolute Error", fontsize=12)

# Write PNG and PDF copies to the output directory, then display the figure.
fig.tight_layout()
fig.savefig(os.path.join(output_dir, 'esm2_plot.png'), dpi=700)
fig.savefig(os.path.join(output_dir, 'esm2_plot.pdf'), dpi=700)
plt.show()
|
|
| |
def _prediction_record(row, actual, predicted):
    """Build one output record from a data row and its true/predicted values.

    "PDB_ID" and "Mutation" fall back to "NA" when the row has no such entry.
    """
    return {
        "PDB_ID": row.get("PDB_ID", "NA"),
        "Mutation": row.get("Mutation", "NA"),
        "Actual_log10Kd": actual,
        "Predicted_log10Kd": predicted,
    }


# One record per held-out sample, in the order the folds produced them.
# The stored indices are positional, hence the .iloc lookup.
records = [
    _prediction_record(data.iloc[position], actual, predicted)
    for position, actual, predicted in zip(test_indices_all, all_y_true, all_y_pred)
]

# Write the predictions as CSV next to the figures.
csv_path = os.path.join(output_dir, "ESM2_predictions.csv")
df_preds = pd.DataFrame(records)
df_preds.to_csv(csv_path, index=False)
print(f"Saved prediction results to {csv_path}")