Spaces:

ml-jku
/

tox21_xgboost_classifier

Sleeping

App Files Files Community

antoniaebner committed on Oct 3

Commit

101fde6

1 Parent(s): 4a5bd87

update pipeline & add selected hyperparams

Browse files

Files changed (5) hide show

predict.py +15 -4
src/data.py +76 -184
src/preprocess.py +405 -0
src/train.py +139 -32
src/utils.py +2 -0

predict.py CHANGED Viewed

@@ -10,8 +10,9 @@ from collections import defaultdict
 import numpy as np
-from src.data import preprocess_molecules
 from src.model import Tox21XGBClassifier
 # ---------------------------------------------------------------------------------------
@@ -28,11 +29,21 @@ def predict(smiles_list: list[str]) -> dict[str, dict[str, float]]:
     """
     print(f"Received {len(smiles_list)} SMILES strings")
     # preprocessing pipeline
-    features, mol_mask = preprocess_molecules(
         smiles_list,
-        load_ecdf_path="assets/ecdfs.pkl",
-        load_scaler_path="assets/scaler.pkl",
     )
     print(f"{len(mol_mask) - sum(mol_mask)} molecules removed during cleaning")
     # setup model

 import numpy as np
 from src.model import Tox21XGBClassifier
+from src.preprocess import create_descriptors
+from src.utils import load_pickle, KNOWN_DESCR
 # ---------------------------------------------------------------------------------------
     """
     print(f"Received {len(smiles_list)} SMILES strings")
     # preprocessing pipeline
+    ecdfs_path = "assets/ecdfs.pkl"
+    scaler_path = "assets/scaler.pkl"
+    ecdfs = load_pickle(ecdfs_path)
+    scaler = load_pickle(scaler_path)
+    print(f"Loaded ecdfs from {ecdfs_path}")
+    print(f"Loaded scaler from {scaler_path}")
+    descriptors = KNOWN_DESCR
+    features, mol_mask = create_descriptors(
         smiles_list,
+        ecdfs=ecdfs,
+        scaler=scaler,
+        descriptors=descriptors,
     )
+    print(f"Created descriptors {descriptors} for molecules.")
     print(f"{len(mol_mask) - sum(mol_mask)} molecules removed during cleaning")
     # setup model

src/data.py CHANGED Viewed

@@ -6,193 +6,85 @@ As an input it takes a list of SMILES and it outputs a nested dictionary with
 SMILES and target names as keys.
 """
-import os
 import numpy as np
-from sklearn.preprocessing import StandardScaler
-from statsmodels.distributions.empirical_distribution import ECDF
-from rdkit import Chem, DataStructs
-from rdkit.Chem import Descriptors, rdFingerprintGenerator
-from rdkit.Chem.rdchem import Mol
-from .utils import USED_200_DESCR, Standardizer, load_pickle, write_pickle
-def preprocess_molecules(
-    smiles_list: list[str],
-    load_ecdf_path: str = "",
-    load_scaler_path: str = "",
-    save_ecdf_path: str = "",
-    save_scaler_path: str = "",
-) -> tuple[np.ndarray, list[int]]:
-    """Preprocessing pipeline for a list of molecules.
-    Args:
-        smiles_list (list[str]): list of SMILES
-        load_ecdf_path (str, optional): Path to load ECDFs from. Defaults to "".
-        load_scaler_path (str, optional): Path to load fitted StandardScaler from. Defaults to "".
-        save_ecdf_path (str, optional): Path to save calculated ECDFs. Defaults to "".
-        save_scaler_path (str, optional): Path to save fitted StandardScaler. Defaults to "".
-    Returns:
-        np.ndarray: normalized ECFPs fingerprints and RDKit descriptor quantiles
-        list[bool]: mask that contains False at index `i`, if molecule in `smiles_list` at
-            index `i` could not be cleaned and was removed.
-    """
-    assert not (
-        load_ecdf_path and save_ecdf_path
-    ), "Cannot pass 'load_ecdf_path' and 'save_ecdf_path' simultaneously"
-    assert not (
-        load_scaler_path and save_scaler_path
-    ), "Cannot pass 'load_scaler_path' and 'save_scaler_path' simultaneously"
-    ecdfs = (
-        load_pickle(load_ecdf_path)
-        if load_ecdf_path and os.path.exists(load_ecdf_path)
-        else None
-    )
-    scaler = (
-        load_pickle(load_scaler_path)
-        if load_scaler_path and os.path.exists(load_scaler_path)
-        else None
     )
-    # Create cleaned rdkit mol objects
-    mols, clean_mol_mask = create_cleaned_mol_objects(smiles_list)
-    print("Cleaned molecules")
-    # Create fingerprints and descriptors
-    ecfps = create_ecfp_fps(mols)
-    print("Created ECFP fingerprints")
-    rdkit_descrs = create_rdkit_descriptors(mols)
-    print("Created RDKit descriptors")
-    # Create and save ecdfs
-    if ecdfs is None:
-        print("Create ECDFs")
-        ecdfs = []
-        for column in range(rdkit_descrs.shape[1]):
-            raw_values = rdkit_descrs[:, column].reshape(-1)
-            ecdfs.append(ECDF(raw_values))
-        if save_ecdf_path:
-            write_pickle(save_ecdf_path, ecdfs)
-            print(f"Saved ECDFs under {save_ecdf_path}")
-    # Create quantiles
-    rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
-    print("Created quantiles of RDKit descriptors")
-    # Concatenate features
-    raw_features = np.concatenate((ecfps, rdkit_descr_quantiles), axis=1)
-    if scaler is None:
-        scaler = StandardScaler()
-        scaler.fit(raw_features)
-        print("Fitted the StandardScaler")
-        if save_scaler_path:
-            write_pickle(save_scaler_path, scaler)
-            print(f"Saved the StandardScaler under {save_scaler_path}")
-    # Normalize feature vectors
-    normalized_features = scaler.transform(raw_features)
-    print("Normalized the molecule features")
-    return normalized_features, clean_mol_mask
-def create_cleaned_mol_objects(smiles: list[str]) -> list[Mol]:
-    """This function creates cleaned RDKit mol objects from a list of SMILES.
-    Args:
-        smiles (list[str]): list of SMILES
-    Returns:
-        list[Mol]: list of cleaned molecules
-        list[bool]: mask that contains False at index `i`, if molecule in `smiles` at
-            index `i` could not be cleaned and was removed.
-    """
-    sm = Standardizer(canon_taut=True)
-    clean_mol_mask = list()
-    mols = list()
-    for i, smile in enumerate(smiles):
-        mol = Chem.MolFromSmiles(smile)
-        standardized_mol, _ = sm.standardize_mol(mol)
-        is_cleaned = standardized_mol is not None
-        clean_mol_mask.append(is_cleaned)
-        if not is_cleaned:
-            continue
-        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
-        mols.append(can_mol)
-    return mols, clean_mol_mask
-def create_ecfp_fps(mols: list[Mol]) -> np.ndarray:
-    """This function creates ECFP fingerprints for a list of molecules.
-    Args:
-        mols (list[Mol]): list of molecules
-    Returns:
-        np.ndarray: ECFP fingerprints of molecules
-    """
-    ecfps = list()
-    for mol in mols:
-        fp_sparse_vec = rdFingerprintGenerator.GetCountFPs(
-            [mol], fpType=rdFingerprintGenerator.MorganFP
-        )[0]
-        fp = np.zeros((0,), np.int8)
-        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
-        ecfps.append(fp)
-    return np.array(ecfps)
-def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
-    """This function creates RDKit descriptors for a list of molecules.
-    Args:
-        mols (list[Mol]): list of molecules
-    Returns:
-        np.ndarray: RDKit descriptors of molecules
-    """
-    rdkit_descriptors = list()
-    for mol in mols:
-        descrs = []
-        for _, descr_calc_fn in Descriptors._descList:
-            descrs.append(descr_calc_fn(mol))
-        descrs = np.array(descrs)
-        descrs = descrs[USED_200_DESCR]
-        rdkit_descriptors.append(descrs)
-    return np.array(rdkit_descriptors)
-def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
-    """Create quantile values for given features using the columns
-    Args:
-        raw_features (np.ndarray): values to put into quantiles
-        ecdfs (list): ECDFs to use
-    Returns:
-        np.ndarray: computed quantiles
-    """
-    quantiles = np.zeros_like(raw_features)
-    for column in range(raw_features.shape[1]):
-        raw_values = raw_features[:, column].reshape(-1)
-        ecdf = ecdfs[column]
-        q = ecdf(raw_values)
-        quantiles[:, column] = q
-    return quantiles

 SMILES and target names as keys.
 """
+from typing import Iterable, Literal
 import numpy as np
+import torch
+from .preprocess import normalize_features
+KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
+def get_descriptor_dataset(
+    data_path: str,
+    descriptors: Iterable[str] | Literal["all"],
+    scaler=None,
+    save_scaler_path: str = "data/scaler.pkl",
+    verbose=True,
+    normalize=True,
+):
+    if descriptors == "all":
+        descriptors = KNOWN_DESCR
+    assert isinstance(descriptors, Iterable), "Passed descriptors are not iterable!"
+    assert all(
+        [descr in KNOWN_DESCR for descr in descriptors]
+    ), f"Passed descriptors contains unknown descriptor types. Allowed descriptors: {KNOWN_DESCR}"
+    datafile = np.load(data_path)
+    if not isinstance(datafile, np.ndarray):
+        # concatenate all descriptors and normalize
+        data = np.concatenate([datafile[descr] for descr in descriptors], axis=1)
+        labels = datafile["labels"]
+    else:
+        print("NPY file passed, cannot select specific descriptors")
+        data, labels = datafile[:, :-12], datafile[:, -12:]
+    if normalize:
+        data, scaler = normalize_features(
+            data,
+            scaler=scaler,
+            save_scaler_path=save_scaler_path,
+            verbose=verbose,
+        )
+    # filter out unsanitized molecules
+    mask = ~np.isnan(data).any(axis=1)
+    data = data[mask]
+    labels = labels[mask]
+    assert data.shape[0] == labels.shape[0], (
+        f"Mismatch between data and labels: "
+        f"data has {data.shape[0]} samples, but labels has {labels.shape[0]} samples."
     )
+    return (data, labels, scaler)
+def get_torch_descriptor_dataset(
+    data_path: str,
+    descriptors: list[str],
+    scaler=None,
+    save_scaler_path: str = "data/scaler.pkl",
+    nan_to_num: int = -100,
+    verbose=True,
+    normalize=True,
+) -> torch.utils.data.TensorDataset:
+    data, labels, scaler = get_descriptor_dataset(
+        data_path,
+        descriptors,
+        scaler,
+        save_scaler_path,
+        verbose=verbose,
+        normalize=normalize,
+    )
+    labels = np.nan_to_num(labels, nan=nan_to_num)
+    dataset = torch.utils.data.TensorDataset(
+        torch.FloatTensor(data), torch.LongTensor(labels)
+    )
+    return dataset, scaler

src/preprocess.py ADDED Viewed

	@@ -0,0 +1,405 @@

+# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
+"""
+This file contains the data processing for Tox21.
+As an input it takes a list of SMILES and it outputs a nested dictionary with
+SMILES and target names as keys.
+"""
+import os
+import argparse
+import json
+from typing import Iterable
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+from statsmodels.distributions.empirical_distribution import ECDF
+from datasets import load_dataset
+from rdkit import Chem, DataStructs
+from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
+from rdkit.Chem.rdchem import Mol
+from src.utils import (
+    TASKS,
+    KNOWN_DESCR,
+    HF_TOKEN,
+    USED_200_DESCR,
+    Standardizer,
+    load_pickle,
+    write_pickle,
+)
+parser = argparse.ArgumentParser(
+    description="Data preprocessing script for the Tox21 dataset"
+)
+parser.add_argument(
+    "--save_folder",
+    type=str,
+    default="data/",
+)
+parser.add_argument(
+    "--use_hf",
+    type=int,
+    default=0,
+)
+parser.add_argument(
+    "--path_ecdfs",
+    type=str,
+    default="data/ecdfs.pkl",
+)
+parser.add_argument(
+    "--tox_smarts_filepath",
+    type=str,
+    default="data/tox_smarts.json",
+)
+def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
+    """This function creates cleaned RDKit mol objects from a list of SMILES.
+    Args:
+        smiles (list[str]): list of SMILES
+    Returns:
+        list[Mol]: list of cleaned molecules
+        np.ndarray[bool]: mask that contains False at index `i`, if molecule in `smiles` at
+            index `i` could not be cleaned and was removed.
+    """
+    sm = Standardizer(canon_taut=True)
+    clean_mol_mask = list()
+    mols = list()
+    for i, smile in enumerate(smiles):
+        mol = Chem.MolFromSmiles(smile)
+        standardized_mol, _ = sm.standardize_mol(mol)
+        is_cleaned = standardized_mol is not None
+        clean_mol_mask.append(is_cleaned)
+        if not is_cleaned:
+            continue
+        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
+        mols.append(can_mol)
+    return mols, np.array(clean_mol_mask)
+def create_ecfp_fps(mols: list[Mol]) -> np.ndarray:
+    """This function creates ECFP fingerprints for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: ECFP fingerprints of molecules
+    """
+    ecfps = list()
+    for mol in mols:
+        fp_sparse_vec = rdFingerprintGenerator.GetCountFPs(
+            [mol], fpType=rdFingerprintGenerator.MorganFP
+        )[0]
+        fp = np.zeros((0,), np.int8)
+        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
+        ecfps.append(fp)
+    return np.array(ecfps)
+def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
+    maccs = [MACCSkeys.GenMACCSKeys(x) for x in mols]
+    return np.array(maccs)
+def get_tox_patterns(filepath: str):
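+    """This loads and parses the tox SMARTS patterns defined in a JSON file (e.g. tox_smarts.json).
+    Args:
+        filepath (str): path to the JSON file containing the SMARTS definitions
+    Returns:
+        list: one tuple (patterns, negations, merge_any) per SMARTS entry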
+    """
+    # load patterns
+    with open(filepath) as f:
+        smarts_list = [s[1] for s in json.load(f)]
+    # Entries that mix AND and OR are not supported by the parsing below
+    assert len([s for s in smarts_list if ("AND" in s) and ("OR" in s)]) == 0
+    # Chem.MolFromSmarts takes a long time so it pays off to parse all the smarts first
+    # and then use them for all molecules. This gives a huge speedup over existing code.
+    # a list of patterns, whether to negate the match result and how to join them to obtain one boolean value
+    all_patterns = []
+    for smarts in smarts_list:
+        patterns = []  # list of smarts-patterns
+        # one flag per pattern above; True means the match result of that pattern is negated later.
+        negations = []
+        if " AND " in smarts:
+            smarts = smarts.split(" AND ")
+            merge_any = False  # If an ' AND ' is found all 'subsmarts' have to match
+        else:
+            # If there is an ' OR ' present it's enough if any of the 'subsmarts' match.
+            # This also accumulates smarts where neither ' OR ' nor ' AND ' occur
+            smarts = smarts.split(" OR ")
+            merge_any = True
+        # for all subsmarts check if they are preceded by 'NOT '
+        for s in smarts:
+            neg = s.startswith("NOT ")
+            if neg:
+                s = s[4:]
+            patterns.append(Chem.MolFromSmarts(s))
+            negations.append(neg)
+        all_patterns.append((patterns, negations, merge_any))
+    return all_patterns
+def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
+    """Matches the tox patterns against a molecule. Returns a boolean array"""
+    tox_data = []
+    for mol in mols:
+        mol_features = []
+        for patts, negations, merge_any in patterns:
+            matches = [mol.HasSubstructMatch(p) for p in patts]
+            matches = [m != n for m, n in zip(matches, negations)]
+            if merge_any:
+                pres = any(matches)
+            else:
+                pres = all(matches)
+            mol_features.append(pres)
+        tox_data.append(np.array(mol_features))
+    return np.array(tox_data)
+def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
+    """This function creates RDKit descriptors for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: RDKit descriptors of molecules
+    """
+    rdkit_descriptors = list()
+    for mol in mols:
+        descrs = []
+        for _, descr_calc_fn in Descriptors._descList:
+            descrs.append(descr_calc_fn(mol))
+        descrs = np.array(descrs)
+        descrs = descrs[USED_200_DESCR]
+        rdkit_descriptors.append(descrs)
+    return np.array(rdkit_descriptors)
+def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
+    """Create quantile values for given features using the columns
+    Args:
+        raw_features (np.ndarray): values to put into quantiles
+        ecdfs (list): ECDFs to use
+    Returns:
+        np.ndarray: computed quantiles
+    """
+    quantiles = np.zeros_like(raw_features)
+    for column in range(raw_features.shape[1]):
+        raw_values = raw_features[:, column].reshape(-1)
+        ecdf = ecdfs[column]
+        q = ecdf(raw_values)
+        quantiles[:, column] = q
+    return quantiles
+def fill(features, mask, value=np.nan):
+    n_mols = len(mask)
+    n_features = features.shape[1]
+    data = np.zeros(shape=(n_mols, n_features))
+    data.fill(value)
+    data[~mask] = features
+    return data
+def normalize_features(
+    raw_features,
+    scaler=None,
+    save_scaler_path: str = "",
+    verbose=True,
+):
+    if scaler is None:
+        scaler = StandardScaler()
+        scaler.fit(raw_features)
+        if verbose:
+            print("Fitted the StandardScaler")
+        if save_scaler_path:
+            write_pickle(save_scaler_path, scaler)
+            if verbose:
+                print(f"Saved the StandardScaler under {save_scaler_path}")
+    # Normalize feature vectors
+    normalized_features = scaler.transform(raw_features)
+    if verbose:
+        print("Normalized molecule features")
+    return normalized_features, scaler
+def create_descriptors(
+    smiles,
+    ecdfs=None,
+    scaler=None,
+    descriptors: Iterable = KNOWN_DESCR,
+):
+    # Create cleaned rdkit mol objects
+    mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
+    print("Cleaned molecules")
+    features = []
+    if "ecfps" in descriptors:
+        # Create fingerprints and descriptors
+        ecfps = create_ecfp_fps(mols)
+        # expand using mol_mask
+        ecfps = fill(ecfps, ~clean_mol_mask)
+        features.append(ecfps)
+        print("Created ECFP fingerprints")
+    if "rdkit_descr_quantiles" in descriptors:
+        rdkit_descrs = create_rdkit_descriptors(mols)
+        print("Created RDKit descriptors")
+        # Create and save ecdfs
+        if ecdfs is None:
+            print("Create ECDFs")
+            ecdfs = []
+            for column in range(rdkit_descrs.shape[1]):
+                raw_values = rdkit_descrs[:, column].reshape(-1)
+                ecdfs.append(ECDF(raw_values))
+        # Create quantiles
+        rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
+        # expand using mol_mask
+        rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
+        features.append(rdkit_descr_quantiles)
+        print("Created quantiles of RDKit descriptors")
+    if "maccs" in descriptors:
+        maccs = create_maccs_keys(mols)
+        maccs = fill(maccs, ~clean_mol_mask)
+        features.append(maccs)
+        print("Created MACCS keys")
+    if "tox" in descriptors:
+        tox_patterns = get_tox_patterns("assets/tox_smarts.json")
+        tox = create_tox_features(mols, tox_patterns)
+        tox = fill(tox, ~clean_mol_mask)
+        features.append(tox)
+        print("Created Tox features")
+    # concatenate features
+    raw_features = np.concatenate(features, axis=1)
+    # normalize with scaler if scaler is passed, else create scaler
+    features, _ = normalize_features(
+        raw_features,
+        scaler=scaler,
+        verbose=True,
+    )
+    return features, clean_mol_mask
+def main(args):
+    splits = ["train", "validation"]
+    ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
+    for split in splits:
+        print(f"Preprocess {split} molecules")
+        smiles = list(ds[split]["smiles"])
+        # Create cleaned rdkit mol objects
+        mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
+        print("Cleaned molecules")
+        tox_patterns = get_tox_patterns(args.tox_smarts_filepath)
+        # Create fingerprints and descriptors
+        ecfps = create_ecfp_fps(mols)
+        # expand using mol_mask
+        ecfps = fill(ecfps, ~clean_mol_mask)
+        print("Created ECFP fingerprints")
+        rdkit_descrs = create_rdkit_descriptors(mols)
+        print("Created RDKit descriptors")
+        # Create and save ecdfs
+        if split == "train":
+            print("Create ECDFs")
+            ecdfs = []
+            for column in range(rdkit_descrs.shape[1]):
+                raw_values = rdkit_descrs[:, column].reshape(-1)
+                ecdfs.append(ECDF(raw_values))
+            write_pickle(args.path_ecdfs, ecdfs)
+            print(f"Saved ECDFs under {args.path_ecdfs}")
+        else:
+            print(f"Load ECDFs from {args.path_ecdfs}")
+            ecdfs = load_pickle(args.path_ecdfs)
+        # Create quantiles
+        rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
+        # expand using mol_mask
+        rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
+        print("Created quantiles of RDKit descriptors")
+        maccs = create_maccs_keys(mols)
+        maccs = fill(maccs, ~clean_mol_mask)
+        print("Created MACCS keys")
+        tox = create_tox_features(mols, tox_patterns)
+        tox = fill(tox, ~clean_mol_mask)
+        print("Created Tox features")
+        labels = []
+        for task in TASKS:
+            datasplit = ds[split].to_pandas() if args.use_hf else ds[split]
+            labels.append(datasplit[task].to_numpy())
+        labels = np.stack(labels, axis=1)
+        save_path = os.path.join(args.save_folder, f"tox21_{split}.npz")
+        with open(save_path, "wb") as f:
+            np.savez(
+                f,
+                labels=labels,
+                ecfps=ecfps,
+                rdkit_descr_quantiles=rdkit_descr_quantiles,
+                maccs=maccs,
+                tox=tox,
+            )
+            print(f"Saved preprocessed {split} split under {save_path}")
+    print("Preprocessing finished successfully")
+if __name__ == "__main__":
+    args = parser.parse_args()
+    if not os.path.exists(args.save_folder):
+        os.makedirs(args.save_folder)
+    if not os.path.exists(os.path.dirname(args.path_ecdfs)):
+        os.makedirs(os.path.dirname(args.path_ecdfs))
+    main(args)

src/train.py CHANGED Viewed

@@ -2,17 +2,19 @@
 Script for fitting and saving any preprocessing assets, as well as the fitted XGBoost model
 """
 import argparse
 import numpy as np
 from tabulate import tabulate
-from datasets import load_dataset
 from sklearn.metrics import roc_auc_score
-from .data import preprocess_molecules
 from .model import Tox21XGBClassifier
-from .utils import HF_TOKEN
 parser = argparse.ArgumentParser(description="XGBoost Trainig script for Tox21 dataset")
@@ -36,51 +38,156 @@ parser.add_argument(
 def main(args):
-    ds = load_dataset("tschouis/tox21", token=HF_TOKEN)
     print("Preprocess train molecules")
-    train_smiles = list(ds["train"]["smiles"])
-    train_features, train_mol_mask = preprocess_molecules(
-        train_smiles,
-        save_ecdf_path=args.path_ecdfs,
-        save_scaler_path=args.path_scaler,
     )
-    print("Preprocess validation molecules")
-    val_smiles = list(ds["validation"]["smiles"])
-    val_features, val_mol_mask = preprocess_molecules(
-        val_smiles,
-        load_ecdf_path=args.path_ecdfs,
-        load_scaler_path=args.path_scaler,
     )
-    model = Tox21XGBClassifier(seed=42)
     print("Start training.")
-    for task in model.tasks:
-        task_labels = ds["train"].to_pandas()[task].to_numpy()
-        task_labels = task_labels[train_mol_mask]
         label_mask = ~np.isnan(task_labels)
         print(f"Fit task {task} using {sum(label_mask)} samples")
-        model.fit(task, train_features[label_mask], task_labels[label_mask].astype(int))
     print(f"Save model under {args.save_path_model}")
     model.save_model(args.save_path_model)
     print("Evaluate model")
     results = {}
-    for task in model.tasks:
-        task_labels = ds["validation"].to_pandas()[task].to_numpy()
-        task_labels = task_labels[val_mol_mask]
         label_mask = ~np.isnan(task_labels)
-        pred = model.predict(task, val_features[label_mask])
-        results[task] = [
-            roc_auc_score(y_true=task_labels[label_mask].astype(int), y_score=pred)
-        ]
     print("Results:")
     print(tabulate(results, headers="keys"))

 Script for fitting and saving any preprocessing assets, as well as the fitted XGBoost model
 """
+import os
 import argparse
 import numpy as np
 from tabulate import tabulate
 from sklearn.metrics import roc_auc_score
+from .data import get_descriptor_dataset
 from .model import Tox21XGBClassifier
+SEED = 42
+DATA_FOLDER = "data/"
 parser = argparse.ArgumentParser(description="XGBoost Trainig script for Tox21 dataset")
 def main(args):
     print("Preprocess train molecules")
+    # load datasets
+    train_X, train_y, scaler = get_descriptor_dataset(
+        os.path.join(DATA_FOLDER, "tox21_train.npz"),
+        descriptors="all",
+        save_scaler_path="data/scaler.pkl",
     )
+    val_X, val_y, _ = get_descriptor_dataset(
+        os.path.join(DATA_FOLDER, "tox21_validation.npz"),
+        descriptors="all",
+        scaler=scaler,
     )
+    task_config = {
+        "NR-AR": {
+            "colsample_bytree": 0.5,
+            "learning_rate": 0.05,
+            "max_depth": 12,
+            "min_child_weight": 2,
+            "n_estimators": 1000,
+            "scale_pos_weight": 80,
+            "subsample": 0.4,
+        },
+        "NR-AR-LBD": {
+            "colsample_bytree": 0.8,
+            "learning_rate": 0.04,
+            "max_depth": 10,
+            "min_child_weight": 8,
+            "n_estimators": 1000,
+            "scale_pos_weight": 10,
+            "subsample": 0.4,
+        },
+        "NR-AhR": {
+            "colsample_bytree": 0.8,
+            "learning_rate": 0.05,
+            "max_depth": 16,
+            "min_child_weight": 2,
+            "n_estimators": 1000,
+            "scale_pos_weight": 80,
+            "subsample": 1,
+        },
+        "NR-Aromatase": {
+            "colsample_bytree": 0.7,
+            "learning_rate": 0.05,
+            "max_depth": 16,
+            "min_child_weight": 1,
+            "n_estimators": 1000,
+            "scale_pos_weight": 50,
+            "subsample": 0.7,
+        },
+        "NR-ER": {
+            "colsample_bytree": 0.7,
+            "learning_rate": 0.05,
+            "max_depth": 10,
+            "min_child_weight": 4,
+            "n_estimators": 1000,
+            "scale_pos_weight": 25,
+            "subsample": 0.4,
+        },
+        "NR-ER-LBD": {
+            "colsample_bytree": 0.7,
+            "learning_rate": 0.05,
+            "max_depth": 16,
+            "min_child_weight": 4,
+            "n_estimators": 1000,
+            "scale_pos_weight": 10,
+            "subsample": 0.4,
+        },
+        "NR-PPAR-gamma": {
+            "colsample_bytree": 0.8,
+            "learning_rate": 0.01,
+            "max_depth": 12,
+            "min_child_weight": 2,
+            "n_estimators": 1000,
+            "scale_pos_weight": 80,
+            "subsample": 0.4,
+        },
+        "SR-ARE": {
+            "colsample_bytree": 0.7,
+            "learning_rate": 0.05,
+            "max_depth": 16,
+            "min_child_weight": 8,
+            "n_estimators": 1000,
+            "scale_pos_weight": 10,
+            "subsample": 0.7,
+        },
+        "SR-ATAD5": {
+            "colsample_bytree": 0.5,
+            "learning_rate": 0.02,
+            "max_depth": 12,
+            "min_child_weight": 8,
+            "n_estimators": 1000,
+            "scale_pos_weight": 10,
+            "subsample": 0.4,
+        },
+        "SR-HSE": {
+            "colsample_bytree": 0.8,
+            "learning_rate": 0.02,
+            "max_depth": 6,
+            "min_child_weight": 1,
+            "n_estimators": 1000,
+            "scale_pos_weight": 25,
+            "subsample": 1,
+        },
+        "SR-MMP": {
+            "colsample_bytree": 0.5,
+            "learning_rate": 0.02,
+            "max_depth": 16,
+            "min_child_weight": 2,
+            "n_estimators": 1000,
+            "scale_pos_weight": 10,
+            "subsample": 0.7,
+        },
+        "SR-p53": {
+            "colsample_bytree": 0.5,
+            "learning_rate": 0.02,
+            "max_depth": 12,
+            "min_child_weight": 8,
+            "n_estimators": 1000,
+            "scale_pos_weight": 10,
+            "subsample": 0.4,
+        },
+    }
+    model = Tox21XGBClassifier(seed=42, task_config=task_config)
     print("Start training.")
+    for i, task in enumerate(model.tasks):
+        task_labels = train_y[:, i]
         label_mask = ~np.isnan(task_labels)
+        task_data = train_X[label_mask]
+        task_labels = task_labels[label_mask].astype(int)
         print(f"Fit task {task} using {sum(label_mask)} samples")
+        model.fit(task, task_data, task_labels)
     print(f"Save model under {args.save_path_model}")
     model.save_model(args.save_path_model)
     print("Evaluate model")
     results = {}
+    for i, task in enumerate(model.tasks):
+        task_labels = val_y[:, i]
         label_mask = ~np.isnan(task_labels)
+        task_data = val_X[label_mask]
+        task_labels = task_labels[label_mask].astype(int)
+        pred = model.predict(task, task_data)
+        results[task] = [roc_auc_score(y_true=task_labels, y_score=pred)]
     print("Results:")
     print(tabulate(results, headers="keys"))

src/utils.py CHANGED Viewed

@@ -28,6 +28,8 @@ TASKS = [
     "SR-p53",
 ]
 USED_200_DESCR = [
     0,
     1,

     "SR-p53",
 ]
+KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
 USED_200_DESCR = [
     0,
     1,