Spaces:

omasteam
/

Preprocessing-Solver

Sleeping

App Files Files Community

omasteam committed on Nov 7

Commit

133ca9b

verified ·

1 Parent(s): 098ef71

Create App.py

Browse files

Files changed (1) hide show

App.py +132 -0

App.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# app.py - Deploy to Hugging Face Space (New → Gradio → Paste this)
+import gradio as gr
+import pandas as pd
+import numpy as np
+from typing import Tuple, Dict, Any
+import io
+import base64
+import warnings
+warnings.filterwarnings('ignore')
class DSPreprocessor:
    """Auto-fixes the 5 things that waste your time.

    ``fit_transform`` runs five passes over a DataFrame:

    1. downcast ``int64``/``float64`` columns to the smallest fitting dtype,
    2. convert object columns to datetime when enough values parse,
    3. replace high-cardinality object columns with integer category codes,
    4. warn about pairs of columns whose missingness is strongly correlated,
    5. warn about constant columns.

    Everything that was changed or flagged is recorded in ``self.report``
    (keys ``"actions"``, ``"warnings"`` and ``"stats"``).

    Args:
        datetime_threshold: An object column is converted to datetime when
            more than this fraction of the rows parses as a date.
        cardinality_threshold: An object column is encoded to category codes
            when its number of unique values exceeds this fraction of the rows.
        missing_corr_threshold: Two columns are flagged when the absolute
            correlation of their missing-value masks exceeds this value.
    """

    def __init__(
        self,
        datetime_threshold: float = 0.3,
        cardinality_threshold: float = 0.5,
        missing_corr_threshold: float = 0.9,
    ):
        self.datetime_threshold = datetime_threshold
        self.cardinality_threshold = cardinality_threshold
        self.missing_corr_threshold = missing_corr_threshold
        self.report = self._new_report()

    @staticmethod
    def _new_report() -> Dict[str, Any]:
        """Return an empty report skeleton."""
        return {"actions": [], "warnings": [], "stats": {}}

    def fit_transform(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
        """Clean *df* and return ``(cleaned_frame, report)``.

        The input frame is left untouched; the returned frame is a new object.
        The report is rebuilt from scratch on every call, so reusing one
        instance for several frames does not mix their results.
        """
        self.report = self._new_report()
        start_mem = df.memory_usage(deep=True).sum() / 1024**2

        # Capture missingness on the raw input: the passes below turn NaN into
        # -1 codes and unparseable dates into NaT, which would distort it.
        missing_mask = df.isnull()

        # Shallow copy: every pass replaces whole columns on the copy instead
        # of writing into existing data, so the caller's frame is not modified
        # and no column data is duplicated up front.
        df = df.copy(deep=False)

        n_downcast = self._downcast_numeric(df)
        self._parse_datetimes(df)
        self._encode_high_cardinality(df)
        self._flag_missing_correlation(missing_mask)
        self._flag_constant_columns(df)

        # Final stats
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        # Guard the percentage against a zero-byte input frame.
        saved_pct = 100 * (1 - end_mem / start_mem) if start_mem > 0 else 0.0
        self.report["stats"] = {
            "Memory saved": f"{start_mem - end_mem:.1f} MB ({saved_pct:.0f}%)",
            "Rows": len(df),
            "Columns": len(df.columns),
            "Dtypes optimized": n_downcast,
        }
        return df, self.report

    def _downcast_numeric(self, df: pd.DataFrame) -> int:
        """Downcast int64/float64 columns in place; return how many changed."""
        n_changed = 0
        for col in df.select_dtypes(include=['int64', 'float64']).columns:
            col_type = df[col].dtype
            kind = 'integer' if col_type == 'int64' else 'float'
            try:
                converted = pd.to_numeric(df[col], downcast=kind)
            except (ValueError, TypeError) as exc:
                # Best effort: keep the column as it is, but say so.
                self.report["warnings"].append(
                    f"⚠️ {col}: could not downcast ({exc})"
                )
                continue
            if converted.dtype != col_type:
                df[col] = converted
                self.report["actions"].append(
                    f"✓ {col}: {col_type} → {converted.dtype}"
                )
                n_changed += 1
        return n_changed

    def _parse_datetimes(self, df: pd.DataFrame) -> None:
        """Convert object columns to datetime when enough values parse."""
        min_valid = len(df) * self.datetime_threshold
        for col in df.select_dtypes(include=['object']).columns:
            try:
                # No ``infer_datetime_format``: the keyword is deprecated in
                # pandas 2.x, where format inference is the default behavior.
                parsed = pd.to_datetime(df[col], errors='coerce')
            except (ValueError, TypeError, OverflowError):
                # Not interpretable as dates at all; leave the column alone.
                continue
            n_valid = int(parsed.notnull().sum())
            if n_valid <= min_valid:
                continue
            # Values that were present but did not parse are lost to NaT.
            n_lost = int((df[col].notnull() & parsed.isnull()).sum())
            df[col] = parsed
            self.report["actions"].append(
                f"✓ {col}: Parsed datetime ({n_valid} valid)"
            )
            if n_lost:
                self.report["warnings"].append(
                    f"⚠️ {col}: {n_lost:,} non-date value(s) became NaT during datetime parsing"
                )

    def _encode_high_cardinality(self, df: pd.DataFrame) -> None:
        """Replace high-cardinality object columns with category codes.

        Missing values receive the code ``-1`` (pandas' category convention).
        """
        max_unique = len(df) * self.cardinality_threshold
        for col in df.select_dtypes(include=['object']).columns:
            n_unique = df[col].nunique()
            if n_unique > max_unique:
                df[col] = df[col].astype('category').cat.codes
                self.report["warnings"].append(
                    f"⚠️ {col}: {n_unique:,} unique values → Hashed to codes (category leak risk)"
                )

    def _flag_missing_correlation(self, missing_mask: pd.DataFrame) -> None:
        """Warn once per column pair whose missing-value masks correlate."""
        corr = missing_mask.corr()
        values = corr.to_numpy(dtype=float)
        labels = list(corr.columns)
        # NaN entries (columns with no variation in missingness) compare False.
        strong = np.argwhere(np.abs(values) > self.missing_corr_threshold)
        for i, j in strong:
            # Upper triangle only: skips the diagonal and the mirrored pair.
            if i < j:
                self.report["warnings"].append(
                    f"⚠️ Missingness correlation: {labels[i]} ↔ {labels[j]} (r={values[i, j]:.2f})"
                )

    def _flag_constant_columns(self, df: pd.DataFrame) -> None:
        """Warn about columns holding at most one distinct non-null value."""
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            self.report["warnings"].append(
                f"⚠️ Constant columns (drop these): {constant_cols}"
            )
def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, str]:
    """Main function for Gradio: clean an uploaded CSV.

    Args:
        file_obj: The uploaded file, either an object exposing the file path
            as ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.
        target_col: Optional name of the target column. When it exists in the
            CSV it is moved to the last position.

    Returns:
        ``(cleaned_df, report, html)``. On success ``html`` is a download link
        embedding the cleaned CSV as a base64 data URI. When nothing was
        uploaded or the file cannot be read, the first two items are ``None``
        and ``html`` carries a message instead.
    """
    import html  # local import: needed only to escape the error text below

    if file_obj is None:
        return None, None, "Upload a CSV first"

    # Accept both a wrapper object with ``.name`` and a bare path string.
    csv_path = getattr(file_obj, "name", file_obj)
    try:
        df = pd.read_csv(csv_path)
    except (
        OSError,
        UnicodeDecodeError,
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
    ) as exc:
        # The message is rendered as HTML, so escape anything taken from the file.
        return None, None, f"Could not read CSV: {html.escape(str(exc))}"

    preprocessor = DSPreprocessor()

    # Optional target column: tolerate stray whitespace typed into the textbox.
    target_col = (target_col or "").strip()
    if target_col and target_col in df.columns:
        # Move target to end for clarity
        df = df[[c for c in df.columns if c != target_col] + [target_col]]

    cleaned_df, report = preprocessor.fit_transform(df)

    # Create download link (the whole CSV is embedded in the page as base64).
    csv_bytes = cleaned_df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'<a href="data:text/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
    return cleaned_df, report, href
# UI (Gradio)
import os  # used only to check for the optional example file below

with gr.Blocks(title="DS Preprocessor Pro") as demo:
    gr.Markdown("## 🚀 Data Science Preprocessor Pro\nUpload a messy CSV. Get back clean data + audit report.")

    # Inputs: the CSV upload and an optional target column name.
    with gr.Row():
        file_input = gr.File(label="Upload CSV", file_types=[".csv"])
        target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., price")
    with gr.Row():
        go_btn = gr.Button("🔥 Clean My Data", variant="primary", size="lg")

    # Outputs: one tab per item returned by process_file.
    with gr.Tabs():
        with gr.TabItem("Cleaned Data"):
            data_output = gr.Dataframe()
        with gr.TabItem("Audit Report"):
            report_output = gr.JSON()
        with gr.TabItem("Download"):
            download_html = gr.HTML()

    # Magic happens here
    go_btn.click(
        fn=process_file,
        inputs=[file_input, target_input],
        outputs=[data_output, report_output, download_html]
    )

    # Offer the sample only when it has been added to the Space.
    # NOTE(review): a missing example file appears able to abort app startup
    # in Gradio - confirm against the installed version.
    if os.path.exists("sample_messy_data.csv"):
        gr.Examples(
            examples=["sample_messy_data.csv"],  # Create a sample file in your Space
            inputs=[file_input]
        )

# Launch only when run as a script, so importing this module has no side effect.
if __name__ == "__main__":
    demo.launch()