Update app.py
app.py CHANGED
@@ -4,9 +4,6 @@ import os
 import re
 from datetime import datetime
 
-# Leaderboard Data (example CSV file for leaderboard)
-LEADERBOARD_FILE = "leaderboard.csv"
-
 def clean_answer(answer):
     if pd.isna(answer):
         return None
@@ -18,21 +15,38 @@ def clean_answer(answer):
         return first_letter
     return None
 
-def ...  # (removed function, 15 lines; body not recoverable from the rendered diff)
+def write_evaluation_results(results, output_file):
+    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    output_text = [
+        f"Evaluation Results for Model: {results['model_name']}",
+        f"Timestamp: {timestamp}",
+        "-" * 50,
+        f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
+        f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
+        f"Total Questions: {results['total_questions']}",
+        f"Valid Predictions: {results['valid_predictions']}",
+        f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
+        f"Correct Predictions: {results['correct_predictions']}",
+        "\nPerformance by Field:",
+        "-" * 50
+    ]
+
+    for field, metrics in results['field_performance'].items():
+        field_results = [
+            f"\nField: {field}",
+            f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
+            f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
+            f"Correct: {metrics['correct']}/{metrics['total']}",
+            f"Invalid predictions: {metrics['invalid']}"
+        ]
+        output_text.extend(field_results)
+
+    with open(output_file, 'w') as f:
+        f.write('\n'.join(output_text))
+    print('\n'.join(output_text))
+    print(f"\nResults have been saved to: {output_file}")
 
 def evaluate_predictions(prediction_file):
     ground_truth_file = "ground_truth.csv"  # Specify the path to the ground truth file
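
For context on the new helper: below is a minimal sketch of the `results` dictionary that `write_evaluation_results` consumes, inferred from the keys it reads in the hunk above. The model name, field name, and all numbers are made-up placeholders, and the import assumes this file is saved as `app.py`.

# Illustrative only: shape of the `results` dict consumed by write_evaluation_results.
from app import write_evaluation_results  # assumes the app.py from this commit

example_results = {
    "model_name": "example-model",      # placeholder model name
    "overall_accuracy": 0.72,           # 36 correct / 50 total questions
    "valid_accuracy": 0.80,             # 36 correct / 45 valid predictions
    "total_questions": 50,
    "valid_predictions": 45,
    "invalid_predictions": 5,
    "correct_predictions": 36,
    "field_performance": {
        "Biology": {                    # placeholder field name
            "accuracy": 0.72,
            "valid_accuracy": 0.80,
            "correct": 36,
            "total": 50,
            "invalid": 5,
        },
    },
}

write_evaluation_results(example_results, "evaluation_results.txt")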
@@ -70,7 +84,6 @@ def evaluate_predictions(prediction_file):
         total_predictions = len(merged_df)
         total_valid_predictions = len(valid_predictions)
 
-        # Ensure no division by zero
         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
         valid_accuracy = (
             correct_predictions / total_valid_predictions
@@ -114,30 +127,21 @@ def evaluate_predictions(prediction_file):
     except Exception as e:
         return f"Error during evaluation: {str(e)}", None
 
-# ... (15 removed lines of the earlier Gradio UI; not recoverable from the rendered diff)
-        eval_results_file = gr.File(label="Download Evaluation Results")
-        eval_button = gr.Button("Evaluate")
-        eval_button.click(
-            evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
-        )
-    with gr.Tab("Leaderboard"):
-        leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
-        refresh_button = gr.Button("Refresh Leaderboard")
-        refresh_button.click(display_leaderboard, outputs=leaderboard_text)
+# Gradio Interface
+description = "Upload a prediction CSV file to evaluate predictions against the ground truth stored in the system."
+
+demo = gr.Interface(
+    fn=evaluate_predictions,
+    inputs=[
+        gr.File(label="Upload Prediction CSV")
+    ],
+    outputs=[
+        gr.Textbox(label="Evaluation Status"),
+        gr.File(label="Download Evaluation Results")
+    ],
+    title="Prediction Evaluation Tool",
+    description=description
+)
 
 if __name__ == "__main__":
     demo.launch()
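
One note on the wiring: the new `gr.Interface` declares one file input and two outputs (a status textbox and a downloadable file), so `evaluate_predictions` is expected to return a `(status_message, results_file_path)` pair; the error branch above (`return f"Error during evaluation: ...", None`) already follows that contract. A rough local smoke test could look like the sketch below, assuming the function accepts a plain file path (the commit does not show how `prediction_file` is read) and that a hypothetical `predictions.csv` plus `ground_truth.csv` sit next to `app.py`.

# Sketch only: calling evaluate_predictions outside the Gradio UI.
from app import evaluate_predictions  # assumes the app.py from this commit

status, results_path = evaluate_predictions("predictions.csv")  # hypothetical prediction file
print(status)                  # what the "Evaluation Status" textbox would show
if results_path is not None:   # None signals an error, per the except branch
    print(f"Results file: {results_path}")  # what the "Download Evaluation Results" output would serve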