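"""Audio analysis stage: extract the video's audio track, transcribe it with
Whisper, measure loudness with ffmpeg's volumedetect filter, and ask Gemini
for a tone/delivery assessment enriched with the first scene's frames."""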
import os
import json
import base64
import subprocess
from pathlib import Path
from typing import Dict, List

import ffmpeg
import whisper
import google.generativeai as genai

from config import make_path, GEMINI_API_KEY
from files.utils.logging import get_logger


class AudioAnalyzer:
    """Analyzes a video's audio: Whisper transcript, ffmpeg loudness stats,
    and a Gemini multimodal pass over the first scene's frames."""

    def __init__(self, video_path: str, gemini_api_key: str = "", model_size: str = 'small'):
        self.model_size = model_size
        self.video_path = Path(video_path)
        self.audio_path = make_path('interim/audio', video_path, 'audio', 'wav')
        self.json_out = make_path('processed/audio-analysis', video_path, 'audio_analysis', 'json')
        self.logger = get_logger('audio_analysis', f'{self.video_path.stem}_log.txt')

        # Set the Gemini key: an explicit argument wins, else fall back to the environment.
        if gemini_api_key:
            genai.configure(api_key=gemini_api_key)
        else:
            genai.configure(api_key=os.getenv("GEMINI_API_KEY", ""))
        self.llm_model = genai.GenerativeModel('gemini-2.5-pro')

    def _extract_audio(self) -> None:
        """Extract a mono 16 kHz WAV track from the source video."""
        self.audio_path.parent.mkdir(parents=True, exist_ok=True)
        (
            ffmpeg
            .input(str(self.video_path))
            .output(str(self.audio_path), ac=1, ar='16k', format='wav', loglevel='quiet')
            .overwrite_output()
            .run()
        )
        self.logger.info('Audio extracted to %s', self.audio_path)

    def _transcribe(self) -> Dict:
        """Transcribe the extracted audio with Whisper (fp16 disabled for CPU compatibility)."""
        model = whisper.load_model(self.model_size)
        return model.transcribe(str(self.audio_path), fp16=False)

    def _loudness_stats(self, audio_path: Path) -> Dict:
        """Measure mean and peak loudness (dB) with ffmpeg's volumedetect filter.

        volumedetect reports to stderr in lines such as:
            [Parsed_volumedetect_0 @ ...] mean_volume: -21.5 dB
            [Parsed_volumedetect_0 @ ...] max_volume: -4.1 dB
        """
        cmd = [
            'ffmpeg', '-i', str(audio_path),
            '-af', 'volumedetect',
            '-f', 'null', 'NUL' if os.name == 'nt' else '/dev/null'
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        mean = peak = None
        for line in result.stderr.splitlines():
            if 'mean_volume:' in line:
                mean = float(line.split('mean_volume:')[1].split()[0])
            if 'max_volume:' in line:
                peak = float(line.split('max_volume:')[1].split()[0])
        return {'loudness_mean': mean, 'loudness_peak': peak}

    def _load_visual_context(self) -> Dict:
        """Load nearby frames and brightness values from extracted frame data."""
        frame_json_path = make_path('processed/scene-detection', self.video_path, 'scene', 'json')
        frames_dir = make_path('interim/frames', self.video_path, '', '')
        if not frame_json_path.exists():
            self.logger.warning("Frame metadata not found: %s", frame_json_path)
            return {}

        with open(frame_json_path, 'r', encoding='utf-8') as f:
            scene_data = json.load(f)
        if not scene_data.get('scenes'):
            return {}

        # Only the first scene is used: its midpoint plus the neighbouring
        # prev/next frames give the LLM a glimpse of the opening visuals.
        scene = scene_data['scenes'][0]
        mid_time = (float(scene['start_time']) + float(scene['end_time'])) / 2
        scene_idx = 0

        def get_frame_path(tag: str) -> Path:
            return frames_dir / f"{self.video_path.stem}_scene_{scene_idx:02}{tag}.jpg"

        def encode_image(p: Path) -> str:
            if p.exists():
                with open(p, 'rb') as f:
                    return base64.b64encode(f.read()).decode('utf-8')
            return ""

        return {
            'mid_time': mid_time,
            'frame': encode_image(get_frame_path('')),
            'prev': encode_image(get_frame_path('_prev')),
            'next': encode_image(get_frame_path('_next')),
            'brightness': float(scene.get('brightness', -1.0))
        }

    def _gemini_audio_analysis(self, text: str, loudness: Dict, wps: float, visuals: Dict) -> Dict:
        """LLM-enhanced audio analysis using the transcript, loudness, pace,
        and the first scene's frames (prev, current, next)."""
        prompt = f"""
You are an expert video analyst. Based on the transcript, loudness, speaking pace,
and the first scene's frames (prev, current, next), analyze the audio tone.
Answer in JSON only:
{{
  "tone": "calm|excited|angry|funny|sad|neutral",
  "emotion": "joy|sadness|anger|surprise|neutral|mixed",
  "pace": "fast|medium|slow",
  "delivery_score": 0-100,
  "is_hooking_start": true|false,
  "comment": "brief summary of audio performance",
  "is_dark_artistic": true|false,
  "brightness": 0-100
}}
Transcript: {text}
Loudness: {json.dumps(loudness)}
Words/sec: {wps}
Frame brightness: {visuals.get('brightness')}
"""
        # Build multimodal parts for Gemini: the text prompt first, then each
        # available frame as an inline JPEG. The SDK expects raw bytes, so the
        # base64 strings from _load_visual_context are decoded back.
        parts = [{"text": prompt}]
        for tag in ['prev', 'frame', 'next']:
            img_b64 = visuals.get(tag)
            if img_b64:
                parts.append({
                    "inline_data": {
                        "mime_type": "image/jpeg",
                        "data": base64.b64decode(img_b64),
                    }
                })

        try:
            response = self.llm_model.generate_content(
                contents=[{"role": "user", "parts": parts}],
                generation_config={'temperature': 0.3}
            )
            raw = getattr(response, 'text', '').strip()
            # Strip the markdown code fences the model sometimes wraps around JSON.
            cleaned = raw.replace('```json', '').replace('```', '')
            return json.loads(cleaned)
        except Exception as e:
            error_msg = str(e)
            self.logger.error("LLM call failed: %s", e)
            # An invalid API key should stop the pipeline rather than silently degrade.
            if any(keyword in error_msg.lower() for keyword in
                   ["api_key", "invalid", "401", "403", "authentication", "unauthorized"]):
                raise ValueError(f"Invalid Gemini API key: {error_msg}") from e
            # Any other failure falls back to neutral defaults (logged above).
            return {
                "tone": "neutral",
                "emotion": "neutral",
                "pace": "medium",
                "delivery_score": 50,
                "is_hooking_start": False,
                "comment": "LLM analysis failed, using defaults",
                "is_dark_artistic": False,
                "brightness": visuals.get("brightness", -1.0)
            }

    def analyze(self) -> Dict:
        """Run the full pipeline and write the combined result JSON."""
        self._extract_audio()
        whisper_res = self._transcribe()
        full_text = whisper_res['text']
        # Duration is taken from the end of the last Whisper segment.
        duration_s = whisper_res['segments'][-1]['end'] if whisper_res['segments'] else 0
        wps = round(len(full_text.split()) / duration_s, 2) if duration_s else 0

        loudness = self._loudness_stats(self.audio_path)
        visual_context = self._load_visual_context()
        gemini_analysis = self._gemini_audio_analysis(full_text, loudness, wps, visual_context)

        result = {
            'full_transcript': full_text,
            'duration_seconds': duration_s,
            'word_count': len(full_text.split()),
            'words_per_second': wps,
            **loudness,
            **gemini_analysis
        }
        self.json_out.parent.mkdir(parents=True, exist_ok=True)
        with open(self.json_out, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)
        self.logger.info('Audio + Visual LLM analysis saved to %s', self.json_out)
        return result
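

# Example usage: a minimal sketch, assuming the project's `config.make_path`
# resolves output paths and GEMINI_API_KEY is set in the environment.
# The input file name below is hypothetical.
if __name__ == '__main__':
    analyzer = AudioAnalyzer('data/raw/sample_video.mp4')
    print(json.dumps(analyzer.analyze(), indent=2))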