Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| import plotly.graph_objects as go | |
| from sklearn.base import BaseEstimator, ClassifierMixin | |
| from sklearn.preprocessing import RobustScaler, LabelEncoder | |
| from sklearn.feature_selection import SelectFromModel | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
| import xgboost as xgb | |
| from sklearn.linear_model import LogisticRegression | |
| import time | |
| from datetime import datetime | |
class OptimizedStackedClassifier(BaseEstimator, ClassifierMixin):
    """Stacked ensemble classifier: base-model probabilities feed a meta-model.

    Inference-only pipeline. The fitted components (scaler, feature selector,
    base models, meta model, label encoder) are expected to be attached by an
    external training step or restored via joblib — `fit` is not defined here.
    """

    # Each base model emits probabilities for this many classes
    # (readability levels 1-6 in this app).
    _N_CLASSES = 6

    def __init__(self):
        self.scaler = RobustScaler()          # robust to outliers in raw features
        self.label_encoder = LabelEncoder()   # maps labels <-> integer codes
        self.feature_selector = None          # populated during training (not shown)
        self.base_models = None               # list of (name, fitted_model) pairs
        self.meta_model = None                # final estimator over meta-features
        self.selected_features = None         # column names kept after selection
        self.start_time = time.time()         # creation timestamp (diagnostics)

    def _build_meta_features(self, X):
        """Scale X, keep selected columns, and stack base-model probabilities.

        Args:
            X: pandas DataFrame of raw features (columns must match what the
               scaler was fitted on — TODO confirm against training code).

        Returns:
            ndarray of shape (n_samples, n_base_models * _N_CLASSES), where
            each base model owns one contiguous block of class probabilities.
        """
        X_scaled = pd.DataFrame(
            self.scaler.transform(X),
            columns=X.columns
        )
        X_selected = X_scaled[self.selected_features]
        n = self._N_CLASSES
        meta_features = np.zeros((X_selected.shape[0], len(self.base_models) * n))
        for i, (_name, model) in enumerate(self.base_models):
            meta_features[:, i * n:(i + 1) * n] = model.predict_proba(X_selected)
        return meta_features

    def predict(self, X):
        """Predict original (un-encoded) class labels for feature frame X."""
        predictions = self.meta_model.predict(self._build_meta_features(X))
        # Meta-model works on encoded integer labels; map back for callers.
        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X):
        """Return meta-model class probabilities for feature frame X."""
        return self.meta_model.predict_proba(self._build_meta_features(X))
def load_model(model_path):
    """Deserialize a model from *model_path*, reporting failures in the UI.

    Returns the loaded model object, or None when loading fails (the error
    is surfaced via st.error rather than raised, so the app keeps running).
    """
    try:
        model = joblib.load(model_path)
    except Exception as exc:
        st.error(f"Error loading model: {str(exc)}")
        return None
    return model
def create_features(input_data):
    """Build a one-row DataFrame of base plus derived readability features.

    Args:
        input_data: mapping containing the nine base metrics listed below.

    Returns:
        pandas DataFrame with a single row and 16 columns (9 base features
        followed by 7 derived ones), in a fixed column order.
    """
    base_keys = (
        'chars_original', 'chars_tokenized', 'num_words', 'num_tokens',
        'unique_tokens', 'type_token_ratio', 'fertility', 'token_std',
        'avg_token_len',
    )
    feats = {key: input_data[key] for key in base_keys}

    # Small epsilon keeps the ratios finite when a count is zero.
    eps = 1e-10
    feats['chars_per_word'] = feats['chars_original'] / (feats['num_words'] + eps)
    feats['chars_per_token'] = feats['chars_tokenized'] / (feats['num_tokens'] + eps)
    feats['tokens_per_word'] = feats['num_tokens'] / (feats['num_words'] + eps)
    feats['token_complexity'] = feats['token_std'] * feats['avg_token_len']
    feats['lexical_density'] = feats['unique_tokens'] / (feats['num_words'] + eps)
    feats['log_chars'] = np.log1p(feats['chars_original'])
    feats['complexity_score'] = (
        feats['token_complexity']
        * feats['lexical_density']
        * feats['type_token_ratio']
    )
    return pd.DataFrame([feats])
def plot_probabilities(probabilities):
    """Return a Plotly bar chart of the per-level probability distribution."""
    level_labels = [f'Level {idx + 1}' for idx in range(len(probabilities))]
    bars = go.Bar(
        x=level_labels,
        y=probabilities,
        text=np.round(probabilities, 3),  # show rounded value on each bar
        textposition='auto',
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title='Probability Distribution Across Readability Levels',
        xaxis_title='Readability Level',
        yaxis_title='Probability',
        yaxis_range=[0, 1],  # probabilities are bounded, fix the axis
        height=400,
    )
    return fig
def plot_feature_values(features_df):
    """Return a Plotly bar chart of the values in a one-row feature frame."""
    row = features_df.values[0]
    bars = go.Bar(
        x=features_df.columns,
        y=row,
        text=np.round(row, 2),  # annotate bars with rounded values
        textposition='auto',
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title='Feature Values',
        xaxis_title='Features',
        yaxis_title='Value',
        xaxis_tickangle=-45,  # slanted labels so long feature names fit
        height=500,
    )
    return fig
def main():
    """Streamlit entry point: collect text metrics, predict readability level."""
    st.set_page_config(page_title="Text Readability Classifier", layout="wide")
    st.title("Text Readability Classifier")
    st.write("This app predicts the readability level based on text characteristics.")
    # Load the trained model from the working directory; bail out of the app
    # (with an error banner) if it is missing or unreadable.
    model_path = "model.joblib"
    model = load_model(model_path)
    if model is None:
        st.error("Could not load the model. Please check if the model file exists.")
        return
    # Two-column layout: inputs/results on the left (wider), docs on the right.
    col1, col2 = st.columns([2, 1])
    with col1:
        # Input form for text characteristics
        st.subheader("Enter Text Characteristics")
        # Basic features input — these nine keys must match what
        # create_features() reads.
        input_data = {}
        input_data['chars_original'] = st.number_input('Number of Characters (Original)', value=0)
        input_data['chars_tokenized'] = st.number_input('Number of Characters (Tokenized)', value=0)
        input_data['num_words'] = st.number_input('Number of Words', value=0)
        input_data['num_tokens'] = st.number_input('Number of Tokens', value=0)
        input_data['unique_tokens'] = st.number_input('Number of Unique Tokens', value=0)
        input_data['type_token_ratio'] = st.number_input('Type-Token Ratio', value=0.0, min_value=0.0, max_value=1.0)
        input_data['fertility'] = st.number_input('Fertility', value=0.0)
        input_data['token_std'] = st.number_input('Token Standard Deviation', value=0.0)
        input_data['avg_token_len'] = st.number_input('Average Token Length', value=0.0)
        analyze_button = st.button("Analyze", type="primary")
        if analyze_button:
            with st.spinner("Analyzing..."):
                try:
                    # Create features dataframe with all required features
                    features_df = create_features(input_data)
                    # Make prediction (single-row frame -> take element 0)
                    prediction = model.predict(features_df)[0]
                    probabilities = model.predict_proba(features_df)[0]
                    # Display results
                    st.subheader("Analysis Results")
                    # Create metrics row: predicted level + top-class confidence
                    metrics_cols = st.columns(2)
                    with metrics_cols[0]:
                        st.metric("Readability Level", f"Level {prediction}")
                    with metrics_cols[1]:
                        highest_prob = max(probabilities)
                        st.metric("Confidence", f"{highest_prob:.2%}")
                    # Show probability distribution
                    st.plotly_chart(plot_probabilities(probabilities),
                                    use_container_width=True)
                    # Show all feature values including derived features
                    st.subheader("All Features (Including Derived)")
                    st.plotly_chart(plot_feature_values(features_df),
                                    use_container_width=True)
                except Exception as e:
                    # Broad catch keeps the app alive on bad input or a
                    # model/feature mismatch; the message is shown in the UI.
                    st.error(f"Error during analysis: {str(e)}")
    with col2:
        # Information sidebar
        with st.container():
            st.subheader("About Readability Levels")
            st.write("""
            The model predicts readability on a scale from 1 to 6:
            - **Level 1**: Very Easy
            - **Level 2**: Easy
            - **Level 3**: Moderately Easy
            - **Level 4**: Moderate
            - **Level 5**: Moderately Difficult
            - **Level 6**: Difficult
            """)
            st.subheader("Feature Explanations")
            st.write("""
            **Basic Features:**
            - Character counts (original and tokenized)
            - Word and token counts
            - Type-token ratio (vocabulary diversity)
            - Token length statistics
            **Derived Features:**
            - Characters per word/token
            - Token complexity
            - Lexical density
            - Overall complexity score
            """)
            st.subheader("Model Performance")
            st.write("""
            This model achieves:
            - **Accuracy**: 73.86%
            - **Macro Avg F1**: 0.75
            - **Weighted Avg F1**: 0.74
            *Note: Results should be used as guidance rather than absolute measures.*
            """)
# Standard script entry-point guard: run the app only when executed directly.
if __name__ == "__main__":
    main()