Update app.py
app.py
CHANGED
@@ -1,36 +1,82 @@
 import gradio as gr
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 import torchaudio
 import numpy as np
 from datasets import load_dataset
-import torch.nn.functional as F

 # ---------------------------
-#
+# Constants
+# ---------------------------
+TARGET_SR = 44100
+N_FFT = 1024
+HOP_LENGTH = 512
+N_MELS = 64
+
+# ---------------------------
+# Load Dataset Metadata for Labels
 # ---------------------------
 dataset = load_dataset("ccmusic-database/pianos", name="8_class")
 label_names = dataset["train"].features["label"].names
+num_classes = len(label_names)

 # ---------------------------
-#
+# Define the Same CNN Model as in Training
 # ---------------------------
-
-
-
+class PianoCNNMultiTask(nn.Module):
+    def __init__(self, num_classes):
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(3, 16, kernel_size=3, padding=1),
+            nn.BatchNorm2d(16),
+            nn.ReLU(),
+            nn.MaxPool2d(2),  # 128 -> 64
+
+            nn.Conv2d(16, 32, kernel_size=3, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ReLU(),
+            nn.MaxPool2d(2),  # 64 -> 32
+
+            nn.Conv2d(32, 64, kernel_size=3, padding=1),
+            nn.BatchNorm2d(64),
+            nn.ReLU(),
+            nn.MaxPool2d(2),  # 32 -> 16
+
+            nn.Conv2d(64, 128, kernel_size=3, padding=1),
+            nn.BatchNorm2d(128),
+            nn.ReLU(),
+            nn.AdaptiveAvgPool2d((4, 4))  # 4x4 feature map
+        )
+        self.flatten = nn.Flatten()
+        self.fc_shared = nn.Linear(128 * 4 * 4, 256)
+        self.dropout = nn.Dropout(0.3)
+
+        # Classification head
+        self.fc_class = nn.Linear(256, num_classes)
+        # Regression head (quality score)
+        self.fc_reg = nn.Linear(256, 1)
+
+    def forward(self, x):
+        x = self.features(x)
+        x = self.flatten(x)
+        x = F.relu(self.fc_shared(x))
+        x = self.dropout(x)
+        class_logits = self.fc_class(x)
+        quality_pred = self.fc_reg(x).squeeze(1)
+        return class_logits, quality_pred

-
-
-
+# ---------------------------
+# Initialize and Load Trained Model (CPU)
+# ---------------------------
+model = PianoCNNMultiTask(num_classes=num_classes)
+state_dict = torch.load("piano_cnn_multitask.pt", map_location=torch.device("cpu"))
+model.load_state_dict(state_dict)
+model.eval()  # inference mode

 # ---------------------------
 # Audio Preprocessing
 # ---------------------------
-
-TARGET_SR = 44100
-N_FFT = 1024
-HOP_LENGTH = 512
-N_MELS = 64
-
 mel_transform = torchaudio.transforms.MelSpectrogram(
     sample_rate=TARGET_SR,
     n_fft=N_FFT,
@@ -39,14 +85,14 @@ mel_transform = torchaudio.transforms.MelSpectrogram(
     center=False  # we will handle padding manually
 )

-def preprocess_audio(audio):
+def preprocess_audio_to_mel_image(audio):
     """
-    audio from gradio.Audio(type="numpy") is
-
+    audio from gradio.Audio(type="numpy") is (sample_rate, data)
+    Returns a 3x128x128 tensor ready for the CNN.
     """
     sr, data = audio

-    # Convert to
+    # Convert to tensor
     waveform = torch.tensor(data, dtype=torch.float32)

     # If shape is (samples,), make it (1, samples)
@@ -57,7 +103,7 @@ def preprocess_audio(audio):
     if waveform.ndim == 2 and waveform.shape[0] < waveform.shape[1]:
         waveform = waveform.transpose(0, 1)

-    # Convert to mono if stereo
+    # Convert to mono if stereo
     if waveform.shape[0] > 1:
         waveform = waveform.mean(dim=0, keepdim=True)

@@ -65,43 +111,54 @@ def preprocess_audio(audio):
     if sr != TARGET_SR:
         resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SR)
         waveform = resampler(waveform)
-
-
-
-
-
-    if current_len < min_len:
-        pad_amount = min_len - current_len
-        # Pad at the end with zeros
+
+    # Ensure minimum length for STFT
+    min_len = N_FFT
+    if waveform.shape[-1] < min_len:
+        pad_amount = min_len - waveform.shape[-1]
     waveform = F.pad(waveform, (0, pad_amount))

-    # Mel-spectrogram
-    mel = mel_transform(waveform)
+    # Compute Mel-spectrogram and convert to dB
+    mel = mel_transform(waveform)  # [1, n_mels, time]
     mel_db = torchaudio.transforms.AmplitudeToDB()(mel)
-
+
+    # Normalize to 0–1
+    mel_db = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-6)
+
+    # Resize to 128x128 and make 3 channels
+    mel_db = mel_db.unsqueeze(0)  # [1, 1, H, W]
+    mel_resized = F.interpolate(mel_db, size=(128, 128), mode="bilinear", align_corners=False)
+    mel_rgb = mel_resized.repeat(1, 3, 1, 1)  # [1, 3, 128, 128]
+
+    return mel_rgb.squeeze(0)  # [3, 128, 128]

 # ---------------------------
-# Main
+# Main Inference Function
 # ---------------------------
 def analyze_piano(audio):
     if audio is None:
-        return "Please upload or record a piano audio clip (
+        return "Please upload or record a piano audio clip (around 1–3 seconds)."

     try:
-
+        # Preprocess input
+        mel_img = preprocess_audio_to_mel_image(audio)  # [3,128,128]
+        mel_batch = mel_img.unsqueeze(0)  # [1,3,128,128]
+
+        with torch.no_grad():
+            logits, q_pred = model(mel_batch)
+            class_idx = torch.argmax(logits, dim=1).item()
+            quality_score = float(q_pred.item())

-
-
-        quality_score = fake_quality_score(mel)
+        piano_type = label_names[class_idx]
+        quality_score_rounded = round(quality_score, 2)

         output_text = (
             f"Piano Type Prediction: {piano_type}\n"
-            f"Estimated Sound Quality Score: {
+            f"Estimated Sound Quality Score: {quality_score_rounded} / 10"
         )
         return output_text

     except Exception as e:
-        # Show error in the UI instead of crashing the app
         return f"An error occurred while processing the audio: {e}"

 # ---------------------------
@@ -116,7 +173,7 @@ demo = gr.Interface(
     ),
     outputs=gr.Textbox(label="AI Analysis Output"),
     title="AI Piano Sound Analyzer 🎹",
-    description="Upload a short piano recording
+    description="Upload a short piano recording to get a predicted piano type and estimated sound-quality score from the trained CNN model."
 )

 if __name__ == "__main__":
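
A minimal local smoke-test sketch for the new inference path, not part of the commit itself. It assumes app.py and the piano_cnn_multitask.pt checkpoint are present in the working directory and that the ccmusic-database/pianos metadata can be downloaded; the 440 Hz sine wave is only a synthetic stand-in for a real piano clip.

# Hypothetical smoke test: feed a synthetic clip through the preprocessing
# and multi-task CNN defined in app.py above.
import numpy as np

from app import analyze_piano, preprocess_audio_to_mel_image

# Mimic the gr.Audio(type="numpy") payload: (sample_rate, samples), here a 1 s 440 Hz tone.
sr = 44100
t = np.linspace(0.0, 1.0, sr, endpoint=False)
data = (0.5 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)

mel_img = preprocess_audio_to_mel_image((sr, data))
print(mel_img.shape)              # expected: torch.Size([3, 128, 128])
print(analyze_piano((sr, data)))  # prints the piano-type / quality-score text

Since analyze_piano wraps its body in try/except, a missing checkpoint or dataset surfaces as the returned error string rather than a crash.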