Spaces:

ml-jku
/

tox21_rf_classifier

Sleeping

App Files Files Community

antoniaebner commited on Nov 11

Commit

28424e6

1 Parent(s): 3057490

update requirements and add preprocessing

Browse files

Files changed (5) hide show

data/tox_smarts.json +0 -0
preprocess.py +193 -0
requirements.txt +3 -3
src/data.py +313 -74
src/utils.py +9 -0

data/tox_smarts.json ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocess.py ADDED Viewed

	@@ -0,0 +1,193 @@

+# pipeline taken from https://huggingface.co/spaces/ml-jku/mhnfs/blob/main/src/data_preprocessing/create_descriptors.py
+"""
+This file contains the data processing for Tox21.
+As an input it takes a list of SMILES and it outputs a nested dictionary with
+SMILES and target names as keys.
+"""
+import os
+import argparse
+import numpy as np
+from src.data import create_descriptors, get_tox21_split
+from src.utils import (
+    TASKS,
+    HF_TOKEN,
+    write_pickle,
+    create_dir,
+)
+# Command-line interface of the preprocessing script.
+# NOTE(review): in the current state of main() only --save_folder, --cv_fold,
+# --ecfps_radius and --ecfps_folds influence the result. --min_var and --max_corr
+# are forwarded to create_descriptors() but end up in the ignored **kwargs of
+# create_ecfp_fps(); --feature_selection, --feature_selection_path, --ecdfs_path
+# and --ecdfs are only referenced from commented-out code.
+parser = argparse.ArgumentParser(
+    description="Data preprocessing script for the Tox21 dataset"
+)
+parser.add_argument(
+    "--save_folder",
+    type=str,
+    default="data/",
+    help="Folder to which preprocessed the data CSV and NPZ files should be saved.",
+)
+parser.add_argument(
+    "--cv_fold",
+    type=int,
+    default=4,
+    help="Select fold used as validation set.",
+)
+parser.add_argument(
+    "--feature_selection",
+    type=int,
+    default=1,
+    help="True (=1) to use feature selection.",
+)
+parser.add_argument(
+    "--feature_selection_path",
+    type=str,
+    default="feat_selection.npz",
+    help="Filename for saving feature selections.",
+)
+parser.add_argument(
+    "--min_var",
+    type=float,
+    default=0.01,
+    help="Minimum variance threshold for selecting features.",
+)
+parser.add_argument(
+    "--max_corr",
+    type=float,
+    default=0.95,
+    help="Maximum correlation threshold for selecting features.",
+)
+parser.add_argument(
+    "--ecdfs_path",
+    type=str,
+    default="ecdfs.pkl",
+    help="Filename to save ECDFs.",
+)
+parser.add_argument(
+    "--ecfps_radius",
+    type=int,
+    default=3,
+    help="Radius used for creating ECFPs.",
+)
+parser.add_argument(
+    "--ecfps_folds",
+    type=int,
+    default=8192,
+    help="Folds used for creating ECFPs.",
+)
+parser.add_argument(
+    "--ecdfs",
+    type=int,
+    default=1,
+    help="True (=1) to use ECDFs for creating quantiles of the RDKit descriptors.",
+)
+def main(args):
+    """Preprocess the Tox21 train, validation and test splits and save them as NPZ files.
+    1. Load the train/validation data with `get_tox21_split` (uses `HF_TOKEN` and `args.cv_fold`)
+    2. Read the test molecules from the local CSV `data/tox21_test_cv4.csv`
+    3. Compute the features of every split with `create_descriptors`
+    4. Save features and labels of every split as `tox21_{split}_cv4.npz` in `args.save_folder`
+    Args:
+        args: parsed command-line arguments, see the module-level `parser`
+    """
+    ds = get_tox21_split(HF_TOKEN, cvfold=args.cv_fold)
+    feature_creation_kwargs = {
+        "radius": args.ecfps_radius,
+        "fpsize": args.ecfps_folds,
+        "min_var": args.min_var,
+        "max_corr": args.max_corr,
+    }
+    # number of molecules that could not be cleaned, summed over all splits
+    removed_mols = 0
+    splits = ["train", "validation", "test"]
+    for split in splits:
+        print(f"Preprocess {split} molecules")
+        if split != "test":
+            ds_split = ds[split]
+            smiles = list(ds_split["smiles"])
+        else:
+            # NOTE(review): the test CSV path is hard-coded; it follows neither
+            # --save_folder nor --cv_fold. Confirm this is intended.
+            import pandas as pd
+            ds_split = pd.read_csv("data/tox21_test_cv4.csv")
+            smiles = ds_split["smiles"]
+        features, clean_mol_mask = create_descriptors(smiles, **feature_creation_kwargs)
+        # if split == "train":
+        #     output = create_descriptors(
+        #         smiles,
+        #         return_feature_selection=True,
+        #         return_ecdfs=True,
+        #         **feature_creation_kwargs,
+        #     )
+        #     features = output.pop("features")
+        #     if args.feature_selection:
+        #         feature_selection = output.pop("feature_selection")
+        #         np.savez(
+        #             args.feature_selection_path,
+        #             ecfps_selec=feature_selection["ecfps_selec"],
+        #             tox_selec=feature_selection["tox_selec"],
+        #         )
+        #         print(f"Saved feature selection under {args.feature_selection_path}")
+        #     if args.ecdfs:
+        #         ecdfs = output.pop("ecdfs")
+        #         write_pickle(args.ecdfs_path, ecdfs)
+        #         print(f"Saved ECDFs under {args.ecdfs_path}")
+        # else:
+        #     features = create_descriptors(
+        #         smiles,
+        #         ecdfs=ecdfs,
+        #         feature_selection=feature_selection,
+        #         **feature_creation_kwargs,
+        #     )["features"]
+        removed_mols += (~clean_mol_mask).sum()
+        # one label column per task, in the order given by TASKS
+        labels = []
+        for task in TASKS:
+            labels.append(ds_split[task].to_numpy())
+        labels = np.stack(labels, axis=1)
+        # NOTE(review): the "cv4" suffix is hard-coded and does not change with --cv_fold.
+        save_path = os.path.join(args.save_folder, f"tox21_{split}_cv4.npz")
+        with open(save_path, "wb") as f:
+            np.savez(
+                f,
+                # `features` only has rows for cleaned molecules, so the labels of
+                # removed molecules are dropped to keep both arrays aligned
+                labels=labels[clean_mol_mask, :],
+                features=features,
+                # **features,
+            )
+            print(f"Saved preprocessed {split} split under {save_path}")
+    print(f"{removed_mols} mols were removed during cleaning across all datasets")
+    print("Preprocessing finished successfully")
+# Script entry point: parse the CLI arguments, make sure the output folder
+# exists and run the preprocessing.
+if __name__ == "__main__":
+    args = parser.parse_args()
+    # args.ecdfs_path = os.path.join(args.save_folder, args.ecdfs_path)
+    # args.feature_selection_path = os.path.join(
+    #     args.save_folder, args.feature_selection_path
+    # )
+    # the NPZ files are written into save_folder, so it has to exist beforehand
+    create_dir(args.save_folder)
+    # create_dir(args.ecdfs_path, is_file=True)
+    # create_dir(args.feature_selection_path, is_file=True)
+    main(args)

requirements.txt CHANGED Viewed

@@ -1,10 +1,10 @@
 fastapi
 uvicorn[standard]
 statsmodels
-rdkit
-numpy
 scikit-learn==1.7.1
-joblib
 tabulate
 datasets
 torch==2.8.0

 fastapi
 uvicorn[standard]
 statsmodels
+rdkit==2025.09.1
+numpy==2.3.3
 scikit-learn==1.7.1
+joblib==1.5.2
 tabulate
 datasets
 torch==2.8.0

src/data.py CHANGED Viewed

@@ -6,85 +6,324 @@ As an input it takes a list of SMILES and it outputs a nested dictionary with
 SMILES and target names as keys.
 """
-from typing import Iterable, Literal
 import numpy as np
-import torch
-from .preprocess import normalize_features
-KNOWN_DESCR = ["ecfps", "rdkit_descr_quantiles", "maccs", "tox"]
-def get_descriptor_dataset(
-    data_path: str,
-    descriptors: Iterable[str] | Literal["all"],
-    scaler=None,
-    save_scaler_path: str = "data/scaler.pkl",
-    verbose=True,
-    normalize=True,
-):
-    if descriptors == "all":
-        descriptors = KNOWN_DESCR
-    assert isinstance(descriptors, Iterable), "Passed descriptors are not iterable!"
-    assert all(
-        [descr in KNOWN_DESCR for descr in descriptors]
-    ), f"Passed descriptors contains unknown descriptor types. Allowed descriptors: {KNOWN_DESCR}"
-    datafile = np.load(data_path)
-    if not isinstance(datafile, np.ndarray):
-        # concatenate all descriptors and normalize
-        data = np.concatenate([datafile[descr] for descr in descriptors], axis=1)
-        labels = datafile["labels"]
-    else:
-        print("NPY file passed, cannot select specific descriptors")
-        data, labels = datafile[:, :-12], datafile[:, -12:]
-    if normalize:
-        data, scaler = normalize_features(
-            data,
-            scaler=scaler,
-            save_scaler_path=save_scaler_path,
-            verbose=verbose,
         )
-    # filter out unsanitized molecules
-    mask = ~np.isnan(data).any(axis=1)
-    data = data[mask]
-    labels = labels[mask]
-    assert data.shape[0] == labels.shape[0], (
-        f"Mismatch between data and labels: "
-        f"data has {data.shape[0]} samples, but labels has {labels.shape[0]} samples."
-    )
-    return (data, labels, scaler)
-def get_torch_descriptor_dataset(
-    data_path: str,
-    descriptors: list[str],
-    scaler=None,
-    save_scaler_path: str = "data/scaler.pkl",
-    nan_to_num: int = -100,
-    verbose=True,
-    normalize=True,
-) -> torch.utils.data.TensorDataset:
-    data, labels, scaler = get_descriptor_dataset(
-        data_path,
-        descriptors,
-        scaler,
-        save_scaler_path,
-        verbose=verbose,
-        normalize=normalize,
-    )
-    labels = np.nan_to_num(labels, nan=nan_to_num)
-    dataset = torch.utils.data.TensorDataset(
-        torch.FloatTensor(data), torch.LongTensor(labels)
-    )
-    return dataset, scaler

 SMILES and target names as keys.
 """
+import json
 import numpy as np
+import pandas as pd
+from datasets import load_dataset
+from sklearn.feature_selection import VarianceThreshold
+from statsmodels.distributions.empirical_distribution import ECDF
+from rdkit import Chem, DataStructs
+from rdkit.Chem import Descriptors, rdFingerprintGenerator, MACCSkeys
+from rdkit.Chem.rdchem import Mol
+from .utils import (
+    USED_200_DESCR,
+    TOX_SMARTS_PATH,
+    Standardizer,
+)
+def create_cleaned_mol_objects(smiles: list[str]) -> tuple[list[Mol], np.ndarray]:
+    """This function creates cleaned RDKit mol objects from a list of SMILES.
+    Args:
+        smiles (list[str]): list of SMILES
+    Returns:
+        list[Mol]: list of cleaned molecules
+        np.ndarray[bool]: mask that contains False at index `i`, if molecule in `smiles` at
+            index `i` could not be cleaned and was removed.
+    """
+    sm = Standardizer(canon_taut=True)
+    clean_mol_mask = list()
+    mols = list()
+    for i, smile in enumerate(smiles):
+        mol = Chem.MolFromSmiles(smile)
+        standardized_mol, _ = sm.standardize_mol(mol)
+        is_cleaned = standardized_mol is not None
+        clean_mol_mask.append(is_cleaned)
+        if not is_cleaned:
+            continue
+        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
+        mols.append(can_mol)
+    return mols, np.array(clean_mol_mask)
+def create_ecfp_fps(mols: list[Mol], radius=3, fpsize=2048, **kwargs) -> np.ndarray:
+    """This function ECFP fingerprints for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: ECFP fingerprints of molecules
+    """
+    ecfps = list()
+    for mol in mols:
+        gen = rdFingerprintGenerator.GetMorganGenerator(
+            countSimulation=True, fpSize=fpsize, radius=radius
         )
+        fp_sparse_vec = gen.GetCountFingerprint(mol)
+        fp = np.zeros((0,), np.int8)
+        DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
+        ecfps.append(fp)
+    return np.array(ecfps)
+def create_maccs_keys(mols: list[Mol]) -> np.ndarray:
+    """This function creates MACCS keys for a list of molecules.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: array built from the `MACCSkeys.GenMACCSKeys` result of every molecule
+    """
+    maccs = [MACCSkeys.GenMACCSKeys(x) for x in mols]
+    return np.array(maccs)
+def get_tox_patterns(filepath: str):
+    """This loads the tox SMARTS definitions and parses them into RDKit patterns.
+    Args:
+        filepath (str): path to a JSON file; the SMARTS expression is read from
+            index 1 of every entry
+    Returns:
+        list: one tuple `(patterns, negations, merge_any)` per SMARTS expression, where
+            `patterns` are the parsed sub-patterns, `negations` flags the sub-patterns
+            that were preceded by 'NOT ' and `merge_any` is True if one matching
+            sub-pattern is sufficient (OR) and False if all have to match (AND)
+    """
+    # load patterns
+    with open(filepath) as f:
+        smarts_list = [s[1] for s in json.load(f)]
+    # Expressions that mix AND and OR are not supported by the parsing below
+    assert len([s for s in smarts_list if ("AND" in s) and ("OR" in s)]) == 0
+    # Chem.MolFromSmarts takes a long time so it pays off to parse all the smarts first
+    # and then use them for all molecules. This gives a huge speedup over existing code.
+    # a list of patterns, whether to negate the match result and how to join them to obtain one boolean value
+    all_patterns = []
+    for smarts in smarts_list:
+        patterns = []  # list of smarts-patterns
+        # value for each of the patterns above. Negates the values of the above later.
+        negations = []
+        if " AND " in smarts:
+            smarts = smarts.split(" AND ")
+            merge_any = False  # If an ' AND ' is found all 'subsmarts' have to match
+        else:
+            # If there is an ' OR ' present it's enough if any of the 'subsmarts' match.
+            # This also covers smarts where neither ' OR ' nor ' AND ' occur
+            smarts = smarts.split(" OR ")
+            merge_any = True
+        # for all subsmarts check if they are preceded by 'NOT '
+        for s in smarts:
+            neg = s.startswith("NOT ")
+            if neg:
+                # strip the 'NOT ' prefix (4 characters) before parsing
+                s = s[4:]
+            patterns.append(Chem.MolFromSmarts(s))
+            negations.append(neg)
+        all_patterns.append((patterns, negations, merge_any))
+    return all_patterns
+def create_tox_features(mols: list[Mol], patterns: list) -> np.ndarray:
+    """Matches the tox patterns against every molecule.
+    Args:
+        mols (list[Mol]): list of molecules
+        patterns (list): tuples `(patterns, negations, merge_any)` as returned by
+            `get_tox_patterns`
+    Returns:
+        np.ndarray: boolean array with one row per molecule and one column per
+            entry of `patterns`
+    """
+    tox_data = []
+    for mol in mols:
+        mol_features = []
+        for patts, negations, merge_any in patterns:
+            matches = [mol.HasSubstructMatch(p) for p in patts]
+            # XOR with the negation flag: a negated sub-pattern counts as
+            # fulfilled exactly when it does NOT match
+            matches = [m != n for m, n in zip(matches, negations)]
+            if merge_any:
+                # OR semantics: one fulfilled sub-pattern is sufficient
+                pres = any(matches)
+            else:
+                # AND semantics: every sub-pattern has to be fulfilled
+                pres = all(matches)
+            mol_features.append(pres)
+        tox_data.append(np.array(mol_features))
+    return np.array(tox_data)
+def create_rdkit_descriptors(mols: list[Mol]) -> np.ndarray:
+    """This function creates RDKit descriptors for a list of molecules.
+    Every function in `Descriptors._descList` is evaluated per molecule and the
+    result is then indexed with `USED_200_DESCR`.
+    Args:
+        mols (list[Mol]): list of molecules
+    Returns:
+        np.ndarray: RDKit descriptors of molecules
+    """
+    rdkit_descriptors = list()
+    for mol in mols:
+        descrs = []
+        # NOTE(review): `_descList` is a private RDKit attribute; its length and order
+        # presumably depend on the RDKit version, which USED_200_DESCR relies on - confirm
+        for _, descr_calc_fn in Descriptors._descList:
+            descrs.append(descr_calc_fn(mol))
+        descrs = np.array(descrs)
+        # keep only the descriptors selected by USED_200_DESCR
+        descrs = descrs[USED_200_DESCR]
+        rdkit_descriptors.append(descrs)
+    return np.array(rdkit_descriptors)
+def create_quantiles(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
+    """Create quantile values for given features by evaluating one ECDF per column
+    Args:
+        raw_features (np.ndarray): values to put into quantiles, indexed as
+            `[sample, column]`
+        ecdfs (list): ECDFs to use; `ecdfs[i]` is called on column `i`
+    Returns:
+        np.ndarray: computed quantiles, same shape and dtype as `raw_features`
+    """
+    # zeros_like: the result inherits shape and dtype of raw_features
+    quantiles = np.zeros_like(raw_features)
+    for column in range(raw_features.shape[1]):
+        raw_values = raw_features[:, column].reshape(-1)
+        ecdf = ecdfs[column]
+        q = ecdf(raw_values)
+        quantiles[:, column] = q
+    return quantiles
+def fill(features, mask, value=np.nan):
+    """Expand `features` to one row per entry of `mask`.
+    Rows at which `mask` is False receive the rows of `features` (in order), rows at
+    which `mask` is True are set to `value`. The commented-out callers in
+    `create_descriptors` pass `~clean_mol_mask`, i.e. True marks a removed molecule.
+    Args:
+        features: 2D array with one row per False entry of `mask`
+        mask: boolean array
+        value: fill value for the rows at which `mask` is True
+    Returns:
+        np.ndarray: array of shape `(len(mask), features.shape[1])`
+    """
+    n_mols = len(mask)
+    n_features = features.shape[1]
+    data = np.zeros(shape=(n_mols, n_features))
+    data.fill(value)
+    # note the inversion: features go to the rows where mask is False
+    data[~mask] = features
+    return data
+def create_descriptors(
+    smiles,
+    ecdfs=None,
+    feature_selection=None,
+    return_ecdfs=False,
+    return_feature_selection=False,
+    **kwargs,
+):
+    """Create the feature matrix for a list of SMILES.
+    The molecules are cleaned, then ECFP fingerprints, tox features, MACCS keys and
+    RDKit descriptors are computed and concatenated column-wise in this order.
+    Args:
+        smiles: SMILES of the molecules
+        ecdfs: currently unused, only referenced by commented-out code
+        feature_selection: currently unused, only referenced by commented-out code
+        return_ecdfs: currently unused, only referenced by commented-out code
+        return_feature_selection: currently unused, only referenced by commented-out code
+        **kwargs: forwarded to `create_ecfp_fps`
+    Returns:
+        np.ndarray: features of the molecules that could be cleaned
+        np.ndarray[bool]: mask over `smiles`, False where a molecule was removed
+    """
+    # Create cleaned rdkit mol objects
+    mols, clean_mol_mask = create_cleaned_mol_objects(smiles)
+    print("Cleaned molecules")
+    tox_patterns = get_tox_patterns(TOX_SMARTS_PATH)
+    # Create fingerprints and descriptors
+    ecfps = create_ecfp_fps(mols, **kwargs)
+    # expand using mol_mask
+    # ecfps = fill(ecfps, ~clean_mol_mask)
+    print("Created ECFP fingerprints")
+    # print("ecfps features:", ecfps.shape)
+    tox = create_tox_features(mols, tox_patterns)
+    # tox = fill(tox, ~clean_mol_mask)
+    print("Created Tox features")
+    # print("tox features:", tox.shape)
+    # Create and save feature selection for ecfps and tox
+    # if feature_selection is None:
+    #     print("Create Feature selection")
+    #     ecfps_selec = get_feature_selection(ecfps, **kwargs)
+    #     tox_selec = get_feature_selection(tox, **kwargs)
+    #     feature_selection = {"ecfps_selec": ecfps_selec, "tox_selec": tox_selec}
+    # else:
+    #     ecfps_selec = feature_selection["ecfps_selec"]
+    #     tox_selec = feature_selection["tox_selec"]
+    # ecfps = ecfps[:, ecfps_selec]
+    # tox = tox[:, tox_selec]
+    maccs = create_maccs_keys(mols)
+    # maccs = fill(maccs, ~clean_mol_mask)
+    print("Created MACCS keys")
+    rdkit_descrs = create_rdkit_descriptors(mols)
+    # rdkit_descrs = fill(rdkit_descrs, ~clean_mol_mask)
+    print("Created RDKit descriptors")
+    # # Create and save ecdfs
+    # if ecdfs is None:
+    #     print("Create ECDFs")
+    #     ecdfs = []
+    #     for column in range(rdkit_descrs.shape[1]):
+    #         raw_values = rdkit_descrs[:, column].reshape(-1)
+    #         ecdfs.append(ECDF(raw_values))
+    # # Create quantiles
+    # rdkit_descr_quantiles = create_quantiles(rdkit_descrs, ecdfs)
+    # # expand using mol_mask
+    # rdkit_descr_quantiles = fill(rdkit_descr_quantiles, ~clean_mol_mask)
+    # print("Created quantiles of RDKit descriptors")
+    # concatenate features
+    # features = {
+    #     "ecfps": ecfps,
+    #     "tox": tox,
+    #     "maccs": maccs,
+    #     "rdkit_descr_quantiles": rdkit_descr_quantiles,
+    # }
+    # for feat in [ecfps, tox, maccs, rdkit_descrs]:
+    #     print(feat.shape)
+    # np.concat is the NumPy >= 2.0 alias of np.concatenate
+    features = np.concat((ecfps, tox, maccs, rdkit_descrs), axis=1)
+    # return_dict = {"features": features}
+    # if return_ecdfs:
+    #     return_dict["ecdfs"] = ecdfs
+    # if return_feature_selection:
+    #     return_dict["feature_selection"] = feature_selection
+    return features, clean_mol_mask
+def get_feature_selection(
+    raw_features: np.ndarray, min_var=0.01, max_corr=0.95, **kwargs
+) -> np.ndarray:
+    # select features with at least min_var variation
+    var_thresh = VarianceThreshold(threshold=min_var)
+    feature_selection = var_thresh.fit(raw_features).get_support(indices=True)
+    n_features_preselected = len(feature_selection)
+    # Remove highly correlated features
+    corr_matrix = np.corrcoef(raw_features[:, feature_selection], rowvar=False)
+    upper_tri = np.triu(corr_matrix, k=1)
+    to_keep = np.ones((n_features_preselected,), dtype=bool)
+    for i in range(upper_tri.shape[0]):
+        for j in range(upper_tri.shape[1]):
+            if upper_tri[i, j] > max_corr:
+                to_keep[j] = False
+    feature_selection = feature_selection[to_keep]
+    return feature_selection
+def get_tox21_split(token, cvfold=None):
+    ds = load_dataset("tschouis/tox21", token=token)
+    train_df = ds["train"].to_pandas()
+    val_df = ds["validation"].to_pandas()
+    if cvfold is None:
+        return {"train": train_df, "validation": val_df}
+    combined_df = pd.concat([train_df, val_df], ignore_index=True)
+    cvfold = float(cvfold)
+    # create new splits
+    cvfold = float(cvfold)
+    train_df = combined_df[combined_df.CVfold != cvfold]
+    val_df = combined_df[combined_df.CVfold == cvfold]
+    # exclude train mols that occur in the validation split
+    val_inchikeys = set(val_df["inchikey"])
+    train_df = train_df[~train_df["inchikey"].isin(val_inchikeys)]
+    return {
+        "train": train_df.reset_index(drop=True),
+        "validation": val_df.reset_index(drop=True),
+    }

src/utils.py CHANGED Viewed

@@ -12,6 +12,7 @@ from rdkit import Chem
 from rdkit.Chem.MolStandardize import rdMolStandardize
 HF_TOKEN = os.environ.get("HF_TOKEN")
 TASKS = [
     "NR-AR",
@@ -441,3 +442,11 @@ def load_pickle(path: str):
 def write_pickle(path: str, obj: object):
     with open(path, "wb") as file:
         pickle.dump(obj, file)

 from rdkit.Chem.MolStandardize import rdMolStandardize
 HF_TOKEN = os.environ.get("HF_TOKEN")
+TOX_SMARTS_PATH = "data/tox_smarts.json"
 TASKS = [
     "NR-AR",
 def write_pickle(path: str, obj: object):
     with open(path, "wb") as file:
         pickle.dump(obj, file)
+def create_dir(path, is_file=False):
+    """Creates the parent directories if a path to a file is given, else create the given directory"""
+    to_create = os.path.dirname(path) if is_file else path
+    if not os.path.exists(to_create):
+        os.makedirs(to_create)