from src.preprocessing.preprocessor import (
    single_preprocessing,
    single_audio_preprocessing,
)
from src.spectttra.spectttra_trainer import spectttra_predict
from src.llm2vectrain.model import load_llm2vec_model
from src.llm2vectrain.llm2vec_trainer import l2vec_single_train, load_pca_model
from src.models.mlp import build_mlp, load_config
from src.utils.dataset import instance_scaler, audio_instance_scaler

import time

import numpy as np
import pandas as pd


def predict_multimodal(audio_file, lyrics):
    """
    Run the full multimodal pipeline (preprocessing, feature extraction,
    and MLP inference) for a single data sample.

    Parameters
    ----------
    audio_file : audio_object
        Audio object file
    lyrics : str
        Lyric string

    Returns
    -------
    dict
        Prediction results with keys ``confidence``, ``prediction``
        (string label), ``label`` (numeric label), and ``probability``.
    """
    # 1.) Instantiate the LLM2Vec model
    llm2vec_model = load_llm2vec_model()

    # 2.) Preprocess both audio and lyrics
    audio, lyrics = single_preprocessing(audio_file, lyrics)

    # 3.) Extract features from both modalities
    audio_features = spectttra_predict(audio)
    audio_features = audio_features.reshape(1, -1)
    lyrics_features = l2vec_single_train(llm2vec_model, lyrics)

    # 4.) Scale the vectors using Z-score
    audio_features, lyrics_features = instance_scaler(audio_features, lyrics_features)

    # 5.) Reduce the lyrics vector using the saved PCA model
    reduced_lyrics = load_pca_model(lyrics_features)

    # 6.) Concatenate the audio and reduced lyrics vectors
    results = np.concatenate([audio_features, reduced_lyrics], axis=1)

    # ---- Load MLP classifier ----
    config = load_config("config/model_config.yml")
    classifier = build_mlp(input_dim=results.shape[1], config=config)

    # 7.) Load trained weights
    model_path = "models/mlp/mlp_best_multimodal.pth"
    classifier.load_model(model_path)
    classifier.model.eval()

    # 8.) Run prediction
    confidence, prediction, label, probability = classifier.predict_single(
        results.flatten()
    )

    return {
        "confidence": confidence,
        "prediction": prediction,
        "label": label,
        "probability": probability,
    }


def predict_unimodal(audio_file):
    """
    Run the audio-only pipeline (preprocessing, feature extraction,
    and MLP inference) for a single data sample.

    Parameters
    ----------
    audio_file : audio_object
        Audio object file

    Returns
    -------
    dict
        Prediction results with keys ``confidence``, ``prediction``
        (string label), ``label`` (numeric label), and ``probability``.
    """
    # 1.) Preprocess the audio
    audio = single_audio_preprocessing(audio_file)

    # 2.) Extract audio features with SpecTTTra
    audio_features = spectttra_predict(audio)
    audio_features = audio_features.reshape(1, -1)

    # 3.) Scale the vector using Z-score
    audio_features = audio_instance_scaler(audio_features)

    # 4.) Load MLP classifier
    config = load_config("config/model_config.yml")
    classifier = build_mlp(input_dim=audio_features.shape[1], config=config)

    # 5.) Load trained weights
    model_path = "models/mlp/mlp_best_unimodal.pth"
    classifier.load_model(model_path)
    classifier.model.eval()

    # 6.) Run prediction
    confidence, prediction, label, probability = classifier.predict_single(
        audio_features.flatten()
    )

    return {
        "confidence": confidence,
        "prediction": prediction,
        "label": label,
        "probability": probability,
    }
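# NOTE: The two functions above repeat the same classifier-loading steps
# (build_mlp -> load_model -> eval). A small helper along these lines could
# deduplicate them; the name `_load_classifier` is an illustrative sketch,
# not part of the original module.
def _load_classifier(input_dim, model_path, config):
    """Build an MLP, load its trained weights, and switch it to eval mode."""
    classifier = build_mlp(input_dim=input_dim, config=config)
    classifier.load_model(model_path)
    classifier.model.eval()
    return classifier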
def predict_combined(audio_file, lyrics):
    """
    Generate both multimodal and audio-only predictions in one call.

    Follows the same logic as the separate functions above, but loads the
    model config once and reports per-stage timings.

    Parameters
    ----------
    audio_file : audio_object
        Audio object file
    lyrics : str
        Lyric string

    Returns
    -------
    dict
        Combined results containing both multimodal and audio-only
        predictions, plus timing information under ``performance``.
    """
    start_time = time.time()

    # Load config once and share it between both classifiers
    config = load_config("config/model_config.yml")

    # [1] Multimodal prediction
    print("[Predict] Running multimodal prediction...")
    multimodal_start = time.time()

    # 1.) Load the LLM2Vec model
    llm2vec_model = load_llm2vec_model()

    # 2.) Preprocess both audio and lyrics
    audio_mm, lyrics_mm = single_preprocessing(audio_file, lyrics)

    # 3.) Extract features
    audio_features_mm = spectttra_predict(audio_mm)
    audio_features_mm = audio_features_mm.reshape(1, -1)
    lyrics_features = l2vec_single_train(llm2vec_model, lyrics_mm)

    # 4.) Scale the vectors using Z-score
    audio_features_mm_scaled, lyrics_features_scaled = instance_scaler(
        audio_features_mm, lyrics_features
    )

    # 5.) Reduce the lyrics vector using the saved PCA model
    reduced_lyrics = load_pca_model(lyrics_features_scaled)

    # 6.) Concatenate the vectors
    multimodal_features = np.concatenate(
        [audio_features_mm_scaled, reduced_lyrics], axis=1
    )

    # Load the multimodal MLP classifier
    multimodal_classifier = build_mlp(
        input_dim=multimodal_features.shape[1], config=config
    )
    multimodal_classifier.load_model("models/mlp/mlp_best_multimodal.pth")
    multimodal_classifier.model.eval()

    # Run prediction
    mm_confidence, mm_prediction, mm_label, mm_probability = (
        multimodal_classifier.predict_single(multimodal_features.flatten())
    )

    multimodal_time = time.time() - multimodal_start
    print(f"[Predict] Multimodal prediction completed in {multimodal_time:.2f}s")

    # [2] Unimodal prediction (audio-only)
    print("[Predict] Running audio-only prediction...")
    audio_only_start = time.time()

    # 1.) Preprocess the audio
    audio_au = single_audio_preprocessing(audio_file)

    # 2.) Extract audio features
    audio_features_au = spectttra_predict(audio_au)
    audio_features_au = audio_features_au.reshape(1, -1)

    # 3.) Scale the vector using Z-score
    audio_features_au_scaled = audio_instance_scaler(audio_features_au)

    # Load the audio-only MLP classifier
    audio_classifier = build_mlp(
        input_dim=audio_features_au_scaled.shape[1], config=config
    )
    audio_classifier.load_model("models/mlp/mlp_best_unimodal.pth")
    audio_classifier.model.eval()

    # Run prediction
    au_confidence, au_prediction, au_label, au_probability = (
        audio_classifier.predict_single(audio_features_au_scaled.flatten())
    )

    audio_only_time = time.time() - audio_only_start
    print(f"[Predict] Audio-only prediction completed in {audio_only_time:.2f}s")

    # Summary
    total_time = time.time() - start_time
    print("\n[Predict] Combined prediction completed!")
    print(f"[Predict] Multimodal: {multimodal_time:.2f}s")
    print(f"[Predict] Audio-only: {audio_only_time:.2f}s")
    print(f"[Predict] Total: {total_time:.2f}s")

    return {
        "multimodal": {
            "confidence": mm_confidence,
            "prediction": mm_prediction,
            "label": mm_label,
            "probability": mm_probability,
        },
        "audio_only": {
            "confidence": au_confidence,
            "prediction": au_prediction,
            "label": au_label,
            "probability": au_probability,
        },
        "performance": {
            "total_time_seconds": total_time,
            "multimodal_time_seconds": multimodal_time,
            "audio_only_time_seconds": audio_only_time,
        },
    }
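# Example single-sample usage (hypothetical paths; the actual inputs depend on
# how audio objects are produced upstream):
#
#   out = predict_combined("data/raw/example_song.wav", "example lyrics ...")
#   print(out["multimodal"]["prediction"], out["multimodal"]["confidence"])
#   print(out["audio_only"]["prediction"], out["audio_only"]["confidence"])
#   print(out["performance"]["total_time_seconds"])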
if __name__ == "__main__":
    # Example usage (replace with real inputs; place the songs inside data/raw)
    data = pd.read_csv("data/raw/predict_data_final.csv")
    result = []
    for row in data.itertuples():
        prediction = predict_multimodal(row.song, row.lyrics)
        result.append(
            {
                "song": row.song,
                "label": row.label,
                "predicted_label": prediction["label"],
                "probability": prediction["probability"],
            }
        )

    for r in result:
        print(f"Song: {r['song']}")
        print(f"Actual Label: {r['label']}")
        print(f"Predicted: {r['predicted_label']}")
        print(f"Probability: {r['probability']:.8f}%")
        print("-" * 50)
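    # Optional summary sketch: simple accuracy over the collected results.
    # This assumes the CSV's `label` column uses the same numeric encoding
    # as the classifier's predicted labels.
    if result:
        correct = sum(1 for r in result if r["label"] == r["predicted_label"])
        print(f"Accuracy: {correct}/{len(result)} ({correct / len(result):.2%})")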