antoniaebner committed on
Commit
9b322e1
·
1 Parent(s): 33fd417

restructure feature preprocessing

Browse files
Files changed (4) hide show
  1. config/config.json +2 -0
  2. src/model.py +36 -87
  3. src/preprocess.py +9 -0
  4. train.py +3 -1
config/config.json CHANGED
@@ -2,6 +2,8 @@
2
  "seed": 0,
3
  "ecfp_radius": 3,
4
  "ecfp_fpsize": 8192,
 
 
5
  "model_path": "checkpoints/rf_alltasks.joblib",
6
  "data_folder": "data/",
7
  "log_folder": "logs/",
 
2
  "seed": 0,
3
  "ecfp_radius": 3,
4
  "ecfp_fpsize": 8192,
5
+ "feature_minvar": 0.01,
6
+ "feature_maxcorr": 0.95,
7
  "model_path": "checkpoints/rf_alltasks.joblib",
8
  "data_folder": "data/",
9
  "log_folder": "logs/",
src/model.py CHANGED
@@ -11,12 +11,10 @@ import joblib
11
 
12
  import numpy as np
13
 
14
- from sklearn.base import BaseEstimator, TransformerMixin
15
  from sklearn.ensemble import RandomForestClassifier
16
- from sklearn.feature_selection import VarianceThreshold
17
  from sklearn.preprocessing import StandardScaler
18
- from statsmodels.distributions.empirical_distribution import ECDF
19
 
 
20
  from .utils import TASKS
21
 
22
 
@@ -45,8 +43,8 @@ class Tox21RFClassifier:
45
  )
46
  for task in self.tasks
47
  }
48
- self.feature_selection = FeatureSelector()
49
- self.ecdf = ECDFQuantileCreator(rdkit_desc_idxs)
50
  self.scaler = StandardScaler()
51
 
52
  def load_model(self, path: str) -> None:
@@ -61,9 +59,8 @@ class Tox21RFClassifier:
61
  self.scaler = model["scalers"]
62
  self.rdkit_desc_idxs = model["rdkit_desc_idxs"]
63
 
64
- self.feature_selection.feature_selection = model["feature_selections"]
65
- self.ecdf.ecdfs = model["ecdfs"]
66
- self.ecdf.to_adapt = model["rdkit_desc_idxs"]
67
 
68
  def save_model(self, path: str) -> None:
69
  """Saves the model to a given path
@@ -76,26 +73,47 @@ class Tox21RFClassifier:
76
 
77
  model = {
78
  "models": self.models,
79
- "feature_selections": self.feature_selection.feature_selection,
80
- "ecdfs": self.ecdf.ecdfs,
81
  "scalers": self.scaler,
82
  "rdkit_desc_idxs": self.rdkit_desc_idxs,
83
  }
84
 
85
  joblib.dump(model, path)
86
 
87
- def fit_preprocessing(self, X: np.ndarray) -> None:
88
  X_ = X.copy()
89
 
90
- X_ = self.ecdf.fit_transform(X_)
91
- X_ = self.feature_selection.fit_transform(X_)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  X_ = self.scaler.fit(X_)
93
 
94
- def preprocess(self, X: np.ndarray) -> None:
95
  X_ = X.copy()
96
 
97
- X_ = self.ecdf.transform(X_)
98
- X_ = self.feature_selection.transform(X_)
 
 
99
  X_ = self.scaler.transform(X_)
100
  return X_
101
 
@@ -109,8 +127,7 @@ class Tox21RFClassifier:
109
  """
110
  assert task in self.tasks, f"Unknown task: {task}"
111
 
112
- X_ = self.preprocess(X)
113
-
114
  self.models[task].fit(X_, y)
115
 
116
  def predict(self, task: str, X: np.ndarray) -> np.ndarray:
@@ -128,73 +145,5 @@ class Tox21RFClassifier:
128
  len(X.shape) == 2
129
  ), f"Function expects 2D np.array. Current shape: {X.shape}"
130
 
131
- X_ = self.preprocess(X)
132
  return self.models[task].predict_proba(X_)[:, 1]
133
-
134
-
135
- class FeatureSelector(BaseEstimator, TransformerMixin):
136
- def __init__(self, min_var=0.01, max_corr=0.95):
137
- self.min_var = min_var
138
- self.max_corr = max_corr
139
- self.feature_selection = None
140
-
141
- def fit(self, X, y=None):
142
- # select features with at least 0.01 variation
143
- var_thresh = VarianceThreshold(threshold=self.min_var)
144
- feature_selection = var_thresh.fit(X).get_support(
145
- indices=True
146
- ) # list containing selected feature indices
147
-
148
- n_features_preselected = len(feature_selection)
149
-
150
- # Remove highly correlated features
151
- corr_matrix = np.corrcoef(X[:, feature_selection], rowvar=False)
152
- upper_tri = np.triu(corr_matrix, k=1)
153
- to_keep = np.ones((n_features_preselected,), dtype=bool)
154
- for i in range(upper_tri.shape[0]):
155
- for j in range(upper_tri.shape[1]):
156
- if upper_tri[i, j] > self.max_corr:
157
- to_keep[j] = False
158
-
159
- self.feature_selection = feature_selection[to_keep]
160
-
161
- return self
162
-
163
- def transform(self, X):
164
- return X[:, self.feature_selection]
165
-
166
-
167
- class ECDFQuantileCreator(BaseEstimator, TransformerMixin):
168
- def __init__(self, to_adapt=None):
169
- self.to_adapt = to_adapt
170
- self.ecdfs = None
171
-
172
- def fit(self, X, y=None):
173
- _, n_feat = X.shape
174
-
175
- if self.to_adapt is None:
176
- self.to_adapt = np.arange(n_feat)
177
- else:
178
- assert (
179
- self.to_adapt < n_feat
180
- ).all(), "passed to_adapt list contains more features than in X!"
181
- selected_feat = X[:, self.to_adapt].copy()
182
- ecdfs = []
183
- for column in range(selected_feat.shape[1]):
184
- raw_values = selected_feat[:, column].reshape(-1)
185
- ecdfs.append(ECDF(raw_values))
186
- self.ecdfs = ecdfs
187
- return self
188
-
189
- def transform(self, X):
190
- selected_feat = X[:, self.to_adapt].copy()
191
- quantiles = np.zeros_like(selected_feat)
192
-
193
- for column in range(selected_feat.shape[1]):
194
- raw_values = selected_feat[:, column].reshape(-1)
195
- ecdf = self.ecdfs[column]
196
- q = ecdf(raw_values)
197
- quantiles[:, column] = q
198
-
199
- X[:, self.to_adapt] = quantiles
200
- return X
 
11
 
12
  import numpy as np
13
 
 
14
  from sklearn.ensemble import RandomForestClassifier
 
15
  from sklearn.preprocessing import StandardScaler
 
16
 
17
+ from .preprocess import get_feature_selection, get_ecdfs, create_quantiles
18
  from .utils import TASKS
19
 
20
 
 
43
  )
44
  for task in self.tasks
45
  }
46
+ self.feature_selection = None
47
+ self.ecdfs = None
48
  self.scaler = StandardScaler()
49
 
50
  def load_model(self, path: str) -> None:
 
59
  self.scaler = model["scalers"]
60
  self.rdkit_desc_idxs = model["rdkit_desc_idxs"]
61
 
62
+ self.feature_selection = model["feature_selections"]
63
+ self.ecdfs = model["ecdfs"]
 
64
 
65
  def save_model(self, path: str) -> None:
66
  """Saves the model to a given path
 
73
 
74
  model = {
75
  "models": self.models,
76
+ "feature_selections": self.feature_selection,
77
+ "ecdfs": self.ecdfs,
78
  "scalers": self.scaler,
79
  "rdkit_desc_idxs": self.rdkit_desc_idxs,
80
  }
81
 
82
  joblib.dump(model, path)
83
 
84
+ def fit_preprocessing(self, X: np.ndarray, min_var=0.01, max_corr=0.95) -> None:
85
  X_ = X.copy()
86
 
87
+ _, n_feat = X.shape
88
+
89
+ if self.rdkit_desc_idxs is None:
90
+ self.rdkit_desc_idxs = np.arange(n_feat)
91
+ else:
92
+ assert (
93
+ self.rdkit_desc_idxs < n_feat
94
+ ).all(), "passed to_adapt list contains more features than in X!"
95
+
96
+ self.ecdfs = get_ecdfs(X_[:, self.rdkit_desc_idxs])
97
+ X_[:, self.rdkit_desc_idxs] = create_quantiles(
98
+ X_[:, self.rdkit_desc_idxs], self.ecdfs
99
+ )
100
+
101
+ # get feature selection
102
+ self.feature_selection = get_feature_selection(
103
+ X_, min_var=min_var, max_corr=max_corr
104
+ )
105
+ X_ = X_[:, self.feature_selection]
106
+
107
+ # fit scaler
108
  X_ = self.scaler.fit(X_)
109
 
110
+ def _preprocess(self, X: np.ndarray) -> None:
111
  X_ = X.copy()
112
 
113
+ X_[:, self.rdkit_desc_idxs] = create_quantiles(
114
+ X_[:, self.rdkit_desc_idxs], self.ecdfs
115
+ )
116
+ X_ = X_[:, self.feature_selection]
117
  X_ = self.scaler.transform(X_)
118
  return X_
119
 
 
127
  """
128
  assert task in self.tasks, f"Unknown task: {task}"
129
 
130
+ X_ = self._preprocess(X)
 
131
  self.models[task].fit(X_, y)
132
 
133
  def predict(self, task: str, X: np.ndarray) -> np.ndarray:
 
145
  len(X.shape) == 2
146
  ), f"Function expects 2D np.array. Current shape: {X.shape}"
147
 
148
+ X_ = self._preprocess(X)
149
  return self.models[task].predict_proba(X_)[:, 1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/preprocess.py CHANGED
@@ -13,6 +13,7 @@ import pandas as pd
13
 
14
  from datasets import load_dataset
15
  from sklearn.feature_selection import VarianceThreshold
 
16
 
17
  from rdkit import Chem, DataStructs
18
  from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
@@ -275,6 +276,14 @@ def create_descriptors(
275
  return features, clean_mol_mask
276
 
277
 
 
 
 
 
 
 
 
 
278
  def get_feature_selection(
279
  raw_features: np.ndarray, min_var=0.01, max_corr=0.95, **kwargs
280
  ) -> np.ndarray:
 
13
 
14
  from datasets import load_dataset
15
  from sklearn.feature_selection import VarianceThreshold
16
+ from statsmodels.distributions.empirical_distribution import ECDF
17
 
18
  from rdkit import Chem, DataStructs
19
  from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
 
276
  return features, clean_mol_mask
277
 
278
 
279
+ def get_ecdfs(raw_features: np.ndarray, **kwargs) -> np.ndarray:
280
+ ecdfs = []
281
+ for column in range(raw_features.shape[1]):
282
+ raw_values = raw_features[:, column].reshape(-1)
283
+ ecdfs.append(ECDF(raw_values))
284
+ return ecdfs
285
+
286
+
287
  def get_feature_selection(
288
  raw_features: np.ndarray, min_var=0.01, max_corr=0.95, **kwargs
289
  ) -> np.ndarray:
train.py CHANGED
@@ -88,7 +88,9 @@ def main(cfg):
88
  task_config=task_configs,
89
  rdkit_desc_idxs=rdkit_descr_idxs,
90
  )
91
- model.fit_preprocessing(data)
 
 
92
 
93
  logger.info("Start training.")
94
  for i, task in enumerate(model.tasks):
 
88
  task_config=task_configs,
89
  rdkit_desc_idxs=rdkit_descr_idxs,
90
  )
91
+ model.fit_preprocessing(
92
+ data, min_var=cfg["feature_minvar"], max_corr=cfg["feature_maxcorr"]
93
+ )
94
 
95
  logger.info("Start training.")
96
  for i, task in enumerate(model.tasks):