Spaces:
Runtime error
Runtime error
| import numpy as np | |
| from typing import List, Tuple, Any | |
| from sklearn.preprocessing import StandardScaler, RobustScaler | |
def min_max_normalize_dataset(train_dataset, val_dataset, test_dataset):
    """Scale every example's "label" to [0, 1] via min-max normalization.

    The min/max statistics are computed from ``train_dataset`` only, so
    val/test labels outside the training range will fall outside [0, 1].
    Examples are mutated in place; new lists holding the same dicts are
    returned.

    Args:
        train_dataset: Sequence of dicts, each with a numeric "label" key.
        val_dataset: Same structure as ``train_dataset``.
        test_dataset: Same structure as ``train_dataset``.

    Returns:
        Tuple of (train, val, test) lists with normalized labels.
    """
    labels = [e["label"] for e in train_dataset]
    min_label, max_label = min(labels), max(labels)
    # Guard against a constant training label, which would otherwise
    # raise ZeroDivisionError; all labels then map to 0.0.
    label_range = max_label - min_label
    if label_range == 0:
        label_range = 1.0

    def _scale(dataset):
        # Mutate each example in place and collect into a fresh list.
        out = []
        for e in dataset:
            e["label"] = (e["label"] - min_label) / label_range
            out.append(e)
        return out

    return _scale(train_dataset), _scale(val_dataset), _scale(test_dataset)
def standard_normalize_dataset(train_dataset, val_dataset, test_dataset):
    """Z-score every example's "label" (standardization).

    Mean and standard deviation are computed from ``train_dataset`` only.
    Examples are mutated in place; new lists holding the same dicts are
    returned.

    Args:
        train_dataset: Sequence of dicts, each with a numeric "label" key.
        val_dataset: Same structure as ``train_dataset``.
        test_dataset: Same structure as ``train_dataset``.

    Returns:
        Tuple of (train, val, test) lists with standardized labels.
    """
    train_labels = np.array([e["label"] for e in train_dataset])
    mean_label = np.mean(train_labels)
    std_label = np.std(train_labels)
    # Guard against a constant training label (std == 0), which would
    # otherwise divide by zero; all labels then map to 0.0.
    if std_label == 0:
        std_label = 1.0

    def _standardize(dataset):
        # Mutate each example in place and collect into a fresh list.
        out = []
        for e in dataset:
            e["label"] = (e["label"] - mean_label) / std_label
            out.append(e)
        return out

    return (
        _standardize(train_dataset),
        _standardize(val_dataset),
        _standardize(test_dataset),
    )
def robust_normalize_dataset(train_dataset, val_dataset, test_dataset):
    """Scale labels with statistics robust to outliers (median / IQR).

    The ``RobustScaler`` is fit on ``train_dataset`` labels only. Examples
    are mutated in place; new lists holding the same dicts are returned.
    Each split is transformed with a single vectorized call instead of one
    ``scaler.transform`` call per example, avoiding sklearn's per-call
    validation overhead.

    Args:
        train_dataset: Sequence of dicts, each with a numeric "label" key.
        val_dataset: Same structure as ``train_dataset``.
        test_dataset: Same structure as ``train_dataset``.

    Returns:
        Tuple of (train, val, test) lists with robust-scaled labels.
    """
    scaler = RobustScaler()
    train_labels = np.array([e["label"] for e in train_dataset]).reshape(-1, 1)
    scaler.fit(train_labels)

    def _apply(dataset):
        # One batched transform per split; empty splits short-circuit.
        if not dataset:
            return []
        labels = np.array([e["label"] for e in dataset]).reshape(-1, 1)
        scaled = scaler.transform(labels).ravel()
        for e, value in zip(dataset, scaled):
            e["label"] = value
        return list(dataset)

    return _apply(train_dataset), _apply(val_dataset), _apply(test_dataset)
def log_normalize_dataset(train_dataset, val_dataset, test_dataset, offset=1.0):
    """Apply ``log(label + offset)`` to every example, split by split.

    Useful for compressing heavily right-skewed label distributions.
    Examples are mutated in place; new lists holding the same dicts are
    returned.

    Args:
        train_dataset: Sequence of dicts, each with a numeric "label" key.
        val_dataset: Same structure as ``train_dataset``.
        test_dataset: Same structure as ``train_dataset``.
        offset: Constant added before taking the log (default 1.0).

    Returns:
        Tuple of (train, val, test) lists with log-transformed labels.
    """

    def _shift_log(dataset):
        transformed = []
        for example in dataset:
            example["label"] = np.log(example["label"] + offset)
            transformed.append(example)
        return transformed

    return (
        _shift_log(train_dataset),
        _shift_log(val_dataset),
        _shift_log(test_dataset),
    )
def quantile_normalize_dataset(train_dataset, val_dataset, test_dataset, n_quantiles=1000):
    """Map labels to a uniform distribution via quantile normalization.

    The ``QuantileTransformer`` is fit on ``train_dataset`` labels only.
    Examples are mutated in place; new lists holding the same dicts are
    returned. Each split is transformed with one vectorized call instead of
    one ``transformer.transform`` call per example.

    Args:
        train_dataset: Sequence of dicts, each with a numeric "label" key.
        val_dataset: Same structure as ``train_dataset``.
        test_dataset: Same structure as ``train_dataset``.
        n_quantiles: Requested number of quantiles; clamped to the training
            set size because sklearn requires n_quantiles <= n_samples
            (it would otherwise warn and clamp internally).

    Returns:
        Tuple of (train, val, test) lists with quantile-normalized labels.
    """
    from sklearn.preprocessing import QuantileTransformer

    n_quantiles = min(n_quantiles, len(train_dataset))
    transformer = QuantileTransformer(n_quantiles=n_quantiles, output_distribution='uniform')
    train_labels = np.array([e["label"] for e in train_dataset]).reshape(-1, 1)
    transformer.fit(train_labels)

    def _apply(dataset):
        # One batched transform per split; empty splits short-circuit.
        if not dataset:
            return []
        labels = np.array([e["label"] for e in dataset]).reshape(-1, 1)
        values = transformer.transform(labels).ravel()
        for e, value in zip(dataset, values):
            e["label"] = value
        return list(dataset)

    return _apply(train_dataset), _apply(val_dataset), _apply(test_dataset)
def normalize_dataset(train_dataset, val_dataset, test_dataset, method='min_max', **kwargs):
    """
    Unified interface for different normalization methods.
    Args:
        train_dataset: Training dataset
        val_dataset: Validation dataset
        test_dataset: Test dataset
        method: Normalization method ('min_max', 'standard', 'robust', 'log', 'quantile')
        **kwargs: Additional arguments for specific normalization methods
    Returns:
        Normalized datasets (train, val, test)
    """
    # Dispatch table: method name -> normalization function.
    normalization_methods = {
        'min_max': min_max_normalize_dataset,
        'standard': standard_normalize_dataset,
        'robust': robust_normalize_dataset,
        'log': log_normalize_dataset,
        'quantile': quantile_normalize_dataset
    }
    normalizer = normalization_methods.get(method)
    if normalizer is None:
        raise ValueError(f"Unsupported normalization method: {method}. "
                         f"Available methods: {list(normalization_methods.keys())}")
    return normalizer(train_dataset, val_dataset, test_dataset, **kwargs)