hbp5181 committed on
Commit
2e7b09e
·
verified ·
1 Parent(s): e0bcce2

Upload train.py

Browse files

BindPred fivefold cross-validation

Files changed (1) hide show
  1. train.py +93 -0
train.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Repeated 5-fold cross-validation of a CatBoost regressor on log10(KD).

Reads ``embeddings/ESM2_interaction.csv``, builds ligand / receptor /
physical feature matrices from comma-separated float strings, and evaluates
the model with and without the physical features.  Per-fold Pearson r, R^2
and RMSE are written to ``metrics/InteractionMetrics.csv`` and the two
feature sets are compared with Welch's t test.
"""

import os

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr, ttest_ind
from catboost import CatBoostRegressor

FEATURE_COLUMNS = ["Ligand Features", "Receptor Features", "Physical Features"]
METRIC_NAMES = ["pearson_r", "r2", "rmse"]

# Hyperparameters shared by every fold; training runs on GPU device "0".
MODEL_PARAMS = {
    "iterations": 2000,
    "learning_rate": 0.08,
    "depth": 4,
    "verbose": 500,
    "task_type": "GPU",
    "devices": "0",
}


def _parse_feature_string(text):
    """Turn a comma-separated string into a list of floats, skipping blanks."""
    return [float(tok) for tok in str(text).split(",") if tok.strip()]


def _build_matrix(frame, column):
    """Stack the parsed feature lists of *column* into a 2-D array.

    Raises:
        ValueError: if rows of *column* do not all have the same number of
            values (e.g. a missing cell, which parses to an empty list).
    """
    rows = frame[column]
    lengths = rows.apply(len)
    if lengths.nunique() > 1:
        raise ValueError(
            f"Column {column!r} has rows with differing feature counts "
            f"(min {lengths.min()}, max {lengths.max()}); "
            "check for missing or truncated entries."
        )
    return np.vstack(rows.values)


# Load dataset
data = pd.read_csv("embeddings/ESM2_interaction.csv")

# Missing cells become "" and therefore parse to an empty feature list.
for col in FEATURE_COLUMNS:
    data[col] = data[col].fillna("").apply(_parse_feature_string)

# Build feature arrays
X_ligand = _build_matrix(data, "Ligand Features")
X_receptor = _build_matrix(data, "Receptor Features")
X_physical = _build_matrix(data, "Physical Features")

# Convert KD(M) into log10 scale; log10 is undefined for non-positive values,
# so reject them up front instead of training on -inf / NaN targets.
raw_y = data["KD(M)"].to_numpy(dtype=float)
if not np.all(np.isfinite(raw_y) & (raw_y > 0)):
    raise ValueError("KD(M) must contain only finite, strictly positive values.")
y = np.log10(raw_y)

# The two candidate design matrices do not depend on the repeat or fold,
# so they are assembled once.  Keys are the "with_physical" flag.
X_base = np.hstack([X_ligand, X_receptor])
X_full = np.hstack([X_base, X_physical])
feature_sets = {False: X_base, True: X_full}

records = []

# Repeat 5×5-fold CV, with and without physical features
for repeat in range(1, 6):
    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)

    for include_phys, X_data in feature_sets.items():
        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
            X_train, X_test = X_data[train_idx], X_data[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # A fresh model per fold so no state leaks between folds.
            model = CatBoostRegressor(**MODEL_PARAMS)
            model.fit(X_train, y_train)

            preds = model.predict(X_test)
            records.append({
                "repeat": repeat,
                "fold": fold_idx,
                "with_physical": include_phys,
                "pearson_r": pearsonr(y_test, preds)[0],
                "r2": r2_score(y_test, preds),
                "rmse": np.sqrt(mean_squared_error(y_test, preds)),
            })

# Aggregate metrics: one row per (repeat, feature set, fold)
metrics_df = pd.DataFrame(records)

# Save to CSV
out_dir = "metrics"
os.makedirs(out_dir, exist_ok=True)
csv_path = os.path.join(out_dir, "InteractionMetrics.csv")
metrics_df.to_csv(csv_path, index=False)
print(f"All metrics saved to {csv_path}")

# Welch's (unequal-variance) independent t test for each metric.
# NOTE(review): both groups are scored on splits produced by the same
# fixed-seed KFold, so the samples look paired rather than independent;
# consider whether a paired test is more appropriate — TODO confirm.
uses_physical = metrics_df["with_physical"]
results = {}
for metric in METRIC_NAMES:
    grp_with = metrics_df.loc[uses_physical, metric]
    grp_without = metrics_df.loc[~uses_physical, metric]
    t_stat, p_val = ttest_ind(grp_with, grp_without, equal_var=False)
    results[metric] = (t_stat, p_val)

print("\nT test results comparing with vs without physical features:")
for m, (t_stat, p_val) in results.items():
    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")