github-actions[bot] committed
Commit · ad2cb5b
1 Parent(s): 6d2f665
Automated UV deployment
Files changed:
- =0.1.4 +2 -2
- Dockerfile +1 -1
- files/__init__.py +0 -0
- files/pipeline/__init__.py +0 -0
- files/pipeline/audio_analysis.py +185 -0
- files/pipeline/frame_analysis.py +195 -0
- files/pipeline/frame_extract.py +99 -0
- files/pipeline/scene_detect.py +64 -0
- files/pipeline/scoring.py +202 -0
- files/utils/__init__.py +0 -0
- files/utils/logging.py +19 -0
- main.py +5 -5
- ui/streamlit_app.py +8 -8
=0.1.4
CHANGED
@@ -23,9 +23,9 @@ Requirement already satisfied: sniffio>=1.1 in /opt/hostedtoolcache/Python/3.12.
 Requirement already satisfied: click>=8.0.0 in /opt/hostedtoolcache/Python/3.12.12/x64/lib/python3.12/site-packages (from typer-slim->huggingface-hub>=0.30) (8.2.1)
 Downloading huggingface_hub-1.1.2-py3-none-any.whl (514 kB)
 Downloading hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 164.4 MB/s 0:00:00
 Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
-   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB
+   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 354.7 MB/s 0:00:00
 Downloading typer_slim-0.20.0-py3-none-any.whl (47 kB)
 Installing collected packages: typer-slim, hf-xet, hf-transfer, huggingface-hub
 
Dockerfile
CHANGED
@@ -25,7 +25,7 @@ RUN python -m pip install --upgrade pip setuptools wheel \
     && pip install --no-cache-dir -r requirements.txt
 
 # Copy the entire project structure
-COPY
+COPY files/ ./files/
 COPY ui/ ./ui/
 COPY config.py ./config.py
 COPY __init__.py ./
files/__init__.py
ADDED
File without changes

files/pipeline/__init__.py
ADDED
File without changes
files/pipeline/audio_analysis.py
ADDED
@@ -0,0 +1,185 @@
import os
import json
import ffmpeg
import whisper
import subprocess
import base64
from pathlib import Path
from typing import Dict, List

import google.generativeai as genai

from config import make_path, GEMINI_API_KEY
from files.utils.logging import get_logger


class AudioAnalyzer:
    def __init__(self, video_path: str, gemini_api_key: str = "", model_size: str = 'small'):
        self.model_size = model_size
        self.video_path = Path(video_path)
        self.audio_path = make_path('interim/audio', video_path, 'audio', 'wav')
        self.json_out = make_path('processed/audio-analysis', video_path, 'audio_analysis', 'json')
        self.logger = get_logger('audio_analysis', f'{self.video_path.stem}_log.txt')

        # Set Gemini key (explicit or from environment)
        if gemini_api_key:
            genai.configure(api_key=gemini_api_key)
        else:
            genai.configure(api_key=os.getenv("GEMINI_API_KEY", ""))
        self.llm_model = genai.GenerativeModel('gemini-2.5-pro')

    def _extract_audio(self) -> None:
        self.audio_path.parent.mkdir(parents=True, exist_ok=True)
        (
            ffmpeg
            .input(str(self.video_path))
            .output(str(self.audio_path), ac=1, ar='16k', format='wav', loglevel='quiet')
            .overwrite_output()
            .run()
        )
        self.logger.info('Audio extracted to %s', self.audio_path)

    def _transcribe(self) -> Dict:
        model = whisper.load_model(self.model_size)
        return model.transcribe(str(self.audio_path), fp16=False)

    def _loudness_stats(self, audio_path: Path) -> Dict:
        cmd = [
            'ffmpeg', '-i', str(audio_path),
            '-af', 'volumedetect',
            '-f', 'null', 'NUL' if os.name == 'nt' else '/dev/null'
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        mean = peak = None
        for line in result.stderr.splitlines():
            if 'mean_volume:' in line:
                mean = float(line.split('mean_volume:')[1].split()[0])
            if 'max_volume:' in line:
                peak = float(line.split('max_volume:')[1].split()[0])
        return {'loudness_mean': mean, 'loudness_peak': peak}

    def _load_visual_context(self) -> Dict:
        """Load nearby frames and brightness values from extracted frame data."""
        frame_json_path = make_path('processed/scene-detection', self.video_path, 'scene', 'json')
        frames_dir = make_path('interim/frames', self.video_path, '', '')

        if not frame_json_path.exists():
            self.logger.warning("Frame metadata not found: %s", frame_json_path)
            return {}

        with open(frame_json_path, 'r', encoding='utf-8') as f:
            scene_data = json.load(f)

        if not scene_data.get('scenes'):
            return {}

        scene = scene_data['scenes'][0]
        mid_time = (float(scene['start_time']) + float(scene['end_time'])) / 2
        scene_idx = 0

        def get_frame_path(tag):
            return frames_dir / f"{self.video_path.stem}_scene_{scene_idx:02}{tag}.jpg"

        def encode_image(p: Path) -> str:
            if p.exists():
                with open(p, 'rb') as f:
                    return base64.b64encode(f.read()).decode('utf-8')
            return ""

        return {
            'mid_time': mid_time,
            'frame': encode_image(get_frame_path('')),
            'prev': encode_image(get_frame_path('_prev')),
            'next': encode_image(get_frame_path('_next')),
            'brightness': float(scene.get('brightness', -1.0))
        }

    def _gemini_audio_analysis(self, text: str, loudness: Dict, wps: float, visuals: Dict) -> Dict:
        """LLM-enhanced audio analysis using audio + first scene frames + metadata."""
        prompt = f"""
You are an expert video analyst. Based on the transcript, loudness, speaking pace,
and the first scene's frames (prev, current, next), analyze the audio tone.

Answer in JSON only:
{{
    "tone": "calm|excited|angry|funny|sad|neutral",
    "emotion": "joy|sadness|anger|surprise|neutral|mixed",
    "pace": "fast|medium|slow",
    "delivery_score": 0-100,
    "is_hooking_start": true|false,
    "comment": "brief summary of audio performance",
    "is_dark_artistic": true|false,
    "brightness": 0-100
}}

Transcript: {text}
Loudness: {json.dumps(loudness)}
Words/sec: {wps}
Frame brightness: {visuals.get('brightness')}
"""

        # Properly formatted parts for Gemini multimodal prompt
        parts = [{"text": prompt}]
        for tag in ['prev', 'frame', 'next']:
            img_b64 = visuals.get(tag)
            if img_b64:
                parts.append({
                    "inline_data": {
                        "mime_type": "image/jpeg",
                        "data": base64.b64decode(img_b64),
                    }
                })

        try:
            response = self.llm_model.generate_content(
                contents=[{"role": "user", "parts": parts}],
                generation_config={'temperature': 0.3}
            )
            text = getattr(response, 'text', '').strip()
            cleaned = text.replace('```json', '').replace('```', '')
            return json.loads(cleaned)
        except Exception as e:
            error_msg = str(e)
            self.logger.error("LLM call failed: %s", e)

            # Check if it's an API key error - if so, raise it to stop the pipeline
            if any(keyword in error_msg.lower() for keyword in ["api_key", "invalid", "401", "403", "authentication", "unauthorized"]):
                raise ValueError(f"Invalid Gemini API key: {error_msg}") from e

            # For other errors, return defaults but log the issue
            return {
                "tone": "neutral",
                "emotion": "neutral",
                "pace": "medium",
                "delivery_score": 50,
                "is_hooking_start": False,
                "comment": "LLM analysis failed, using defaults",
                "is_dark_artistic": False,
                "brightness": visuals.get("brightness", -1.0)
            }

    def analyze(self) -> Dict:
        self._extract_audio()
        whisper_res = self._transcribe()
        full_text = whisper_res['text']
        duration_s = whisper_res['segments'][-1]['end'] if whisper_res['segments'] else 0
        wps = round(len(full_text.split()) / duration_s, 2) if duration_s else 0

        loudness = self._loudness_stats(self.audio_path)
        visual_context = self._load_visual_context()
        gemini_analysis = self._gemini_audio_analysis(full_text, loudness, wps, visual_context)

        result = {
            'full_transcript': full_text,
            'duration_seconds': duration_s,
            'word_count': len(full_text.split()),
            'words_per_second': wps,
            **loudness,
            **gemini_analysis
        }

        self.json_out.parent.mkdir(parents=True, exist_ok=True)
        with open(self.json_out, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

        self.logger.info('Audio + Visual LLM analysis saved to %s', self.json_out)
        return result
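A minimal usage sketch for AudioAnalyzer (the input path is hypothetical; it assumes scene detection and frame extraction have already populated processed/scene-detection and interim/frames, and that a Gemini key is set explicitly or via GEMINI_API_KEY):

from files.pipeline.audio_analysis import AudioAnalyzer

# Hypothetical input path; earlier pipeline stages are assumed to have run.
analyzer = AudioAnalyzer('data/raw/sample.mp4', model_size='small')
report = analyzer.analyze()  # extract audio -> Whisper transcript -> Gemini tone read
print(report['words_per_second'], report.get('tone'))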
files/pipeline/frame_analysis.py
ADDED
@@ -0,0 +1,195 @@
import os
import re
import json
import base64
import openai
from pathlib import Path
import google.generativeai as genai
from files.utils.logging import get_logger
from config import make_path, OPENAI_API_KEY, GEMINI_API_KEY, DATA_DIR


class FrameAnalyzer:
    def __init__(self, video_path: str, openai_api_key: str = "", save_dir: str = 'processed/frame-analysis'):
        # Set OpenAI key (explicit or from environment)
        if openai_api_key:
            openai.api_key = openai_api_key
        else:
            openai.api_key = os.getenv("OPENAI_API_KEY")

        self.video_path = Path(video_path)
        self.frames_dir = DATA_DIR / 'interim' / 'frames' / f'{self.video_path.stem}_'
        self.save_path = make_path(save_dir, video_path, 'frame_analysis', 'json')
        self.save_path.parent.mkdir(parents=True, exist_ok=True)

        log_file = f'{self.video_path.stem}_log.txt'
        self.logger = get_logger('frame_analysis', log_file)

    @staticmethod
    def encode_image(path: Path) -> str:
        with open(path, 'rb') as f:
            return base64.b64encode(f.read()).decode('utf-8')

    @staticmethod
    def extract_json(text: str) -> dict:
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        match = re.search(r'```json\s*(\{.*?\})\s*```', text, re.DOTALL)
        if match:
            return json.loads(match.group(1))

        match = re.search(r'(\{.*?\})', text, re.DOTALL)
        if match:
            return json.loads(match.group(1))

        raise ValueError('No valid JSON found in GPT response')

    def gpt_analyze(self, frame_path: Path, prev_path: Path, next_path: Path) -> dict:
        prompt = """
You are an expert video content strategist. Analyze this video frame and surrounding context.
Determine if the lighting is poor or intentionally low for creative reasons.

Output JSON only:
{
    lighting: 0-100,
    is_artistic_dark: true|false,
    composition: 0-100,
    has_text: true|false,
    text: "string",
    hook_strength: 0-100
}
"""

        images = [
            {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{self.encode_image(p)}'}}
            for p in [prev_path, frame_path, next_path] if p.exists()
        ]

        response = openai.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {'role': 'user', 'content': [{'type': 'text', 'text': prompt}] + images}
            ],
            temperature=0.2,
            max_tokens=400,
        )
        return self.extract_json(response.choices[0].message.content)

    def analyze(self) -> dict:
        results = {}
        all_frames = sorted(self.frames_dir.glob('*_scene_*.jpg'))
        center_frames = [f for f in all_frames if '_prev' not in f.name and '_next' not in f.name]

        for frame in center_frames:
            prev = frame.with_name(frame.name.replace('.jpg', '_prev.jpg'))
            next_ = frame.with_name(frame.name.replace('.jpg', '_next.jpg'))

            self.logger.info('Analyzing frame: %s', frame.name)
            try:
                result = self.gpt_analyze(frame, prev, next_)
                results[frame.name] = result
            except Exception as e:
                self.logger.error('LLM analysis failed on %s: %s', frame.name, e)
                results[frame.name] = {'error': str(e)}

        with open(self.save_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        self.logger.info('Frame analysis saved to %s', self.save_path)
        return results


class HookAnalyzer:
    def __init__(self, video_path: str, gemini_api_key: str = ""):
        self.video_path = Path(video_path)
        self.frames_dir = Path('data/interim/frames') / f'{self.video_path.stem}_'
        self.audio_json = make_path('processed/audio-analysis', video_path, 'audio_analysis', 'json')
        self.output_json = make_path('processed/hook-analysis', video_path, 'hook_analysis', 'json')
        self.logger = get_logger('hook_analysis', f'{self.video_path.stem}_log.txt')

        # Set Gemini key (explicit or from environment)
        if gemini_api_key:
            genai.configure(api_key=gemini_api_key)
        else:
            genai.configure(api_key=os.getenv("GEMINI_API_KEY", ""))
        self.model = genai.GenerativeModel('gemini-2.5-pro')

    def _encode_image(self, path: Path) -> bytes:
        with open(path, 'rb') as f:
            return f.read()

    def _load_audio_summary(self) -> dict:
        with open(self.audio_json, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _gemini_hook_alignment(self, audio_summary: dict, frames: list[Path]) -> dict:
        parts = [{'mime_type': 'image/jpeg', 'data': self._encode_image(f)} for f in frames if f.exists()]
        text = f"""You are a virality analyst. Analyze the opening visuals and tone:
- Does the audio mood match the expressions and visuals?
- Are viewers likely to be hooked in the first few seconds?

Audio Summary: {json.dumps(audio_summary)}

Give JSON only:
{{
    "hook_alignment_score": 0-100,
    "facial_sync": "good|ok|poor|none",
    "comment": "short summary"
}}"""

        try:
            response = self.model.generate_content([text] + parts)
            raw_text = getattr(response, 'text', '').strip()
            self.logger.debug("Gemini raw response: %s", raw_text)
            if not raw_text:
                raise ValueError("Gemini response was empty.")

            raw_text = (
                raw_text
                .replace('```json\n', '')
                .replace('\n```', '')
                .replace('```json', '')
                .replace('```', '')
            )

            return json.loads(raw_text)
        except json.JSONDecodeError as e:
            self.logger.error("Failed to parse Gemini response as JSON: %s", e)
            self.logger.debug("Gemini response was: %r", getattr(response, 'text', '<<NO TEXT>>'))
            return {
                "hook_alignment_score": -1,
                "facial_sync": "none",
                "comment": "Invalid JSON response from Gemini"
            }
        except Exception as e:
            error_msg = str(e)
            self.logger.error("Gemini API call failed: %s", e)

            # Check if it's an API key error - if so, raise it to stop the pipeline
            if any(keyword in error_msg.lower() for keyword in ["api_key", "invalid", "401", "403", "authentication", "unauthorized"]):
                raise ValueError(f"Invalid Gemini API key: {error_msg}") from e

            # For other errors, return defaults
            return {
                "hook_alignment_score": -1,
                "facial_sync": "none",
                "comment": f"Gemini API error: {error_msg}"
            }

    def analyze(self) -> dict:
        audio_summary = self._load_audio_summary()
        frames = sorted(self.frames_dir.glob('*_scene_*.jpg'))[:3]
        result = self._gemini_hook_alignment(audio_summary, frames)

        self.output_json.parent.mkdir(parents=True, exist_ok=True)
        with open(self.output_json, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

        self.logger.info('Hook analysis saved to %s', self.output_json)
        return result
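A usage sketch for the two classes above (the path is hypothetical; FrameAnalyzer assumes frames already exist under interim/frames, and HookAnalyzer additionally reads the audio-analysis JSON, so it must run after AudioAnalyzer):

from files.pipeline.frame_analysis import FrameAnalyzer, HookAnalyzer

video = 'data/raw/sample.mp4'  # hypothetical path
frame_scores = FrameAnalyzer(video).analyze()  # per-frame GPT-4o-mini lighting/composition/hook JSON
hook_result = HookAnalyzer(video).analyze()    # Gemini hook alignment over the first three frames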
files/pipeline/frame_extract.py
ADDED
@@ -0,0 +1,99 @@
import json
import subprocess
from pathlib import Path
from config import make_path
from files.utils.logging import get_logger


class FrameExtractor:
    def __init__(self, video_path: str, min_scene_len: float = 0.2):
        self.min_scene_len = min_scene_len
        self.video_path = Path(video_path)
        self.scene_json_path = self.frame_json = make_path('processed/scene-detection', video_path, 'scene', 'json')
        self.output_dir = make_path('interim/frames', video_path, '', '')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        log_file = f'{self.video_path.stem}_log.txt'
        self.logger = get_logger('frame_extract', log_file)

    def _ffmpeg_extract(self, timestamp: float, out_path: Path):
        cmd = [
            'ffmpeg',
            '-loglevel', 'error',
            '-y',
            '-ss', f'{timestamp:.3f}',
            '-t', '1',
            '-i', str(self.video_path),
            '-frames:v', '1',
            '-q:v', '2',
            '-pix_fmt', 'yuvj420p',
            str(out_path)
        ]
        result = subprocess.run(cmd, capture_output=True)
        if result.returncode != 0:
            self.logger.error('ffmpeg failed: %s', result.stderr.decode('utf-8', 'ignore').strip())

    def _get_brightness(self, timestamp: float) -> float:
        cmd = [
            'ffprobe',
            '-v', 'error',
            '-read_intervals', f'%{timestamp}+1',
            '-select_streams', 'v:0',
            '-show_frames',
            '-show_entries', 'frame_tags=lavfi.signalstats.YAVG',
            '-of', 'default=noprint_wrappers=1:nokey=1',
            str(self.video_path)
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        try:
            yavg_values = [float(line.strip()) for line in result.stdout.strip().split('\n') if line.strip()]
            if yavg_values:
                return yavg_values[0]
        except Exception:
            pass
        self.logger.warning('Could not get brightness at %.2fs', timestamp)
        return -1.0

    def extract(self) -> list[dict]:
        with open(self.scene_json_path, encoding='utf-8') as f:
            scenes = json.load(f).get('scenes', [])
        if not scenes:
            self.logger.warning('No scenes found in %s', self.scene_json_path)
            return []

        delta = 0.5
        results = []

        for i, sc in enumerate(scenes):
            start = float(sc['start_time'])
            end = float(sc['end_time'])
            dur = end - start
            if dur < self.min_scene_len:
                self.logger.warning('Scene %s too short (%.2fs), skipping', i, dur)
                continue

            mid = (start + end) / 2

            frame_path = self.output_dir / f'{self.video_path.stem}_scene_{i:02}.jpg'
            prev_path = self.output_dir / f'{self.video_path.stem}_scene_{i:02}_prev.jpg'
            next_path = self.output_dir / f'{self.video_path.stem}_scene_{i:02}_next.jpg'

            self._ffmpeg_extract(mid, frame_path)
            self._ffmpeg_extract(mid - delta, prev_path)
            self._ffmpeg_extract(mid + delta, next_path)

            brightness = self._get_brightness(mid)

            self.logger.info('[Scene %s] %.2fs → %s | Brightness: %.2f', i, mid, frame_path.name, brightness)

            results.append({
                'scene_index': i,
                'timestamp': mid,
                'frame_path': str(frame_path),
                'prev_frame_path': str(prev_path),
                'next_frame_path': str(next_path),
                'brightness': brightness
            })

        self.logger.info('%s frames (with context) extracted to %s', len(results), self.output_dir)
        return results
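The extractor drives everything off the scene JSON written by scene_detect; an illustrative input shape (values invented) and a usage sketch:

# Illustrative processed/scene-detection JSON (invented values):
# {"scenes": [{"start_time": 0.0, "end_time": 3.5},
#             {"start_time": 3.5, "end_time": 7.1}]}
from files.pipeline.frame_extract import FrameExtractor

frames = FrameExtractor('data/raw/sample.mp4', min_scene_len=0.2).extract()  # hypothetical path
# per scene: one mid-point frame plus prev/next frames at +/-0.5 s, with YAVG brightness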
files/pipeline/scene_detect.py
ADDED
@@ -0,0 +1,64 @@
import os
import json
from pathlib import Path
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from files.utils.logging import get_logger
from config import make_path


class SceneDetector:
    def __init__(self, video_path: str, backend='base', return_scenes=False,
                 min_scene_duration=0.1, threshold=30.0, transition_merge_gap=0.1):
        self.video_path = video_path
        self.backend = backend
        self.return_scenes = return_scenes
        self.min_scene_duration = min_scene_duration
        self.threshold = threshold
        self.transition_merge_gap = transition_merge_gap

        log_filename = f'{Path(video_path).stem}_log.txt'
        self.logger = get_logger(name='scene_detect', log_file=log_filename)

    def detect(self, start_time: float = 0, end_time: float = -1) -> list:
        try:
            self.logger.info(f'Detecting scenes for: {self.video_path}')

            video_manager = VideoManager([self.video_path])
            scene_manager = SceneManager()
            scene_manager.add_detector(ContentDetector(threshold=self.threshold))

            video_manager.set_downscale_factor()
            video_manager.start()
            scene_manager.detect_scenes(frame_source=video_manager)
            scene_list = scene_manager.get_scene_list()

            # Format output to match Sieve style.
            # Keys are 'start_time'/'end_time' so the downstream readers
            # (frame_extract, audio_analysis) can index scenes consistently.
            scenes = []
            for start, end in scene_list:
                scenes.append({
                    "start_time": round(start.get_seconds(), 2),
                    "end_time": round(end.get_seconds(), 2)
                })

            self.logger.info(f"{len(scenes)} scenes detected.")
            return [{"scenes": scenes}]

        except Exception as e:
            self.logger.error(f'Scene detection failed: {e}')
            return []

    def detect_and_save(self) -> list:
        scenes = self.detect()
        if not scenes:
            self.logger.warning('No scenes detected. Skipping save.')
            return []

        out_path = make_path('processed/scene-detection', self.video_path, 'scene', 'json')
        out_path.parent.mkdir(parents=True, exist_ok=True)

        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump({'scenes': scenes[0]['scenes']}, f, indent=2)

        self.logger.info(f'Scene data saved to: {out_path}')
        return scenes
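A usage sketch (hypothetical path); detect_and_save() writes the scene JSON that FrameExtractor and AudioAnalyzer later read:

from files.pipeline.scene_detect import SceneDetector

scenes = SceneDetector('data/raw/sample.mp4', threshold=30.0).detect_and_save()
# returns [{"scenes": [{"start_time": ..., "end_time": ...}, ...]}] and saves the
# same payload under processed/scene-detection via make_path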
files/pipeline/scoring.py
ADDED
@@ -0,0 +1,202 @@
import os
import json
import openai
from pathlib import Path
from files.utils.logging import get_logger
from config import make_path, OPENAI_API_KEY


class VideoReport:
    def __init__(self, video_path: str, openai_api_key: str = ""):
        # Set OpenAI key (explicit or from environment)
        if openai_api_key:
            openai.api_key = openai_api_key
        else:
            openai.api_key = os.getenv("OPENAI_API_KEY", "")
        self.video_path = Path(video_path)
        self.audio_json = make_path('processed/audio-analysis', video_path, 'audio_analysis', 'json')
        self.frame_json = make_path('processed/frame-analysis', video_path, 'frame_analysis', 'json')
        self.hook_json = make_path('processed/hook-analysis', video_path, 'hook_analysis', 'json')
        self.output_json = make_path('reports', video_path, 'final_report', 'json')

        log_filename = f'{self.video_path.stem}_log.txt'
        self.logger = get_logger(name='video_report', log_file=log_filename)

        self.audio_analysis = self.load_json(self.audio_json)
        self.frame_analysis = self.load_json(self.frame_json)
        self.hook_analysis = self.load_json(self.hook_json)

    def load_json(self, path: Path):
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception:
            return {}

    def extract_matrices(self):
        return {
            "tone": self.audio_analysis.get("tone", "unknown"),
            "emotion": self.audio_analysis.get("emotion", "unknown"),
            "pace": self.audio_analysis.get("pace", "unknown"),
            "facial_sync": self.hook_analysis.get("facial_sync", "unknown")
        }

    def prepare_prompt(self) -> str:
        prompt_sections = []
        prompt_sections.append(f"""
You are an expert evaluator trained to assess the **virality potential and content quality** of short-form video ads (e.g., TikToks, Reels). You are provided with:

- A sequence of scene-selected **frames**
- A full **audio transcription**
- Detailed **audio statistics**
- And other video metadata

Your task is to analyze the video and assign the **five scores** with weighted importance. Follow the criteria and format strictly.

---

### Scores to Judge (Each 0-100)

You must evaluate the following sub-categories:

- `hook`: Does the video grab attention in the first 3 seconds? A good hook is **surprising, emotional, funny, or visually intense**. A poor hook is **slow, random, or bland**.

- `visuals`: Are visuals high-resolution, diverse, and relevant to the message? Good visuals are **intentional and professionally framed**. Poor visuals are **static, noisy, or irrelevant**.

- `audio`: Is the audio clean, engaging, and well-synced? Quality audio has **clarity, proper levels, and supports the visuals**. Poor audio is **distracting, flat, or off-sync**.

- `engagement`: Does the video maintain interest? Strong pacing, emotional depth, or thought-provoking content improves this. Weak pacing or meaningless content hurts it.

- `visual_diversity`: Does the video use **multiple camera angles, transitions, or visual styles**? A lack of variation makes it feel stale.

---

### Scoring Enforcement Guidelines

- Be **strict**: Low-effort content should fall well below 50
- Be **realistic**: Reward polish, creativity, clarity, and emotional impact
- Only videos with **clear intent and great execution** should reach 80+
- Penalize poor hooks, bland visuals, unclear audio, or meaningless structure
- Ensure your scores reflect meaningful differences between videos - **don't cluster everything around 60**

---
""")

        if self.audio_analysis:
            prompt_sections.append("Audio Analysis:\n" + json.dumps(self.audio_analysis, indent=2))
        if self.frame_analysis:
            prompt_sections.append("\nFrame Analysis:\n" + json.dumps(self.frame_analysis, indent=2))
        if self.hook_analysis:
            prompt_sections.append("\nHook Alignment Analysis:\n" + json.dumps(self.hook_analysis, indent=2))

        matrices = self.extract_matrices()
        prompt_sections.append("\nHere are extracted behavioral/performance matrices:\n" + json.dumps(matrices, indent=2))

        prompt_sections.append(f"""
### Output Format (JSON Only - No Comments or Explanations):
{{
    "video_name": "{self.video_path.stem}",
    "scores": {{
        "hook": 0,
        "visuals": 0,
        "audio": 0,
        "engagement": 0,
        "visual_diversity": 0
    }},
    "matrices": {{
        "tone": "",
        "emotion": "",
        "pace": "",
        "facial_sync": ""
    }},
    "summary": "",
    "suggestions": [
        "Specific improvement 1",
        "Specific improvement 2",
        "Specific improvement 3",
        ... more if required
    ]
}}
""")
        return "\n".join(prompt_sections)

    def query_llm(self, prompt: str) -> dict:
        try:
            response = openai.chat.completions.create(
                model='gpt-4o',
                messages=[
                    {"role": "system", "content": "You are a professional short-video quality evaluator."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.4,
            )
            reply = response.choices[0].message.content.strip()
            cleaned = reply.replace('```json', '').replace('```', '')
            result = json.loads(cleaned)
            return result
        except Exception as e:
            self.logger.error(f"LLM generation failed: {e}")
            return {
                "scores": {
                    "hook": 0,
                    "visuals": 0,
                    "audio": 0,
                    "engagement": 0,
                    "visual_diversity": 0
                },
                "matrices": self.extract_matrices(),
                "summary": "Failed to generate report.",
                "suggestions": ["Try again", "Check input files", "Verify OpenAI key"]
            }

    def compute_virality_score(self, result):
        weights = {
            'hook': 0.18,
            'visuals': 0.20,
            'audio': 0.25,
            'engagement': 0.27,
            'visual_diversity': 0.10
        }

        sub_scores = result["scores"]
        base_score = sum(sub_scores[key] * weights[key] for key in weights)

        bonus = 0
        matrices = result.get("matrices", {})

        if matrices.get("emotion") in ["joy", "inspiration"]:
            bonus += 6
        if matrices.get("tone") in ["funny", "relatable"]:
            bonus += 6
        if matrices.get("facial_sync") in ["ok", "good"]:
            bonus += 4

        if sub_scores.get("hook", 0) <= 30:
            bonus -= 6
        if sub_scores.get("audio", 0) < 40:
            bonus -= 5
        if matrices.get("facial_sync") == "none":
            bonus -= 5

        final_score = max(0, min(100, int(base_score + bonus)))
        return final_score

    def generate(self) -> dict:
        self.logger.info("Preparing prompt for LLM...")
        prompt = self.prepare_prompt()

        self.logger.info("Querying LLM for report generation...")
        result = self.query_llm(prompt)
        total_score = self.compute_virality_score(result)
        final_output = {
            "video_name": self.video_path.stem,
            "total_score": total_score,
            **result
        }
        self.logger.info("Saving final report...")
        self.output_json.parent.mkdir(parents=True, exist_ok=True)
        with open(self.output_json, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, indent=2)

        self.logger.info("Report successfully generated at %s", self.output_json)
        return final_output
files/utils/__init__.py
ADDED
File without changes
files/utils/logging.py
ADDED
@@ -0,0 +1,19 @@
import logging
from pathlib import Path
from config import LOG_DIR


def get_logger(name='vc', log_file: str = 'latest.log', level='INFO'):
    Path(LOG_DIR).mkdir(exist_ok=True)
    log_path = LOG_DIR / log_file

    logger = logging.getLogger(name)
    logger.setLevel(level.upper())

    if not logger.handlers:
        handler = logging.FileHandler(log_path, encoding='utf-8')
        formatter = logging.Formatter('%(asctime)s | %(levelname)-7s | %(name)s | %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger
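A usage sketch for the helper above. Note the handlers guard: the first call for a given logger name attaches the file handler, so later calls with a different log_file reuse the original file:

from files.utils.logging import get_logger

log = get_logger('scene_detect', 'sample_log.txt', level='DEBUG')  # hypothetical file name
log.info('stage started')  # appended to LOG_DIR/sample_log.txt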
main.py
CHANGED
@@ -1,9 +1,9 @@
 from pathlib import Path
-from
-from
-from
-from
-from
+from files.pipeline.scoring import VideoReport
+from files.pipeline.scene_detect import SceneDetector
+from files.pipeline.frame_extract import FrameExtractor
+from files.pipeline.audio_analysis import AudioAnalyzer
+from files.pipeline.frame_analysis import FrameAnalyzer, HookAnalyzer
 
 
 def run_pipeline(video_path: str):
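The body of run_pipeline is not shown in this diff; a plausible wiring of the imported stages in dependency order (an assumption, not the committed code):

# Hypothetical sketch only; the committed run_pipeline body is not in this diff.
def run_pipeline(video_path: str):
    SceneDetector(video_path).detect_and_save()   # 1. scene boundaries
    FrameExtractor(video_path).extract()          # 2. mid-scene frames + context
    AudioAnalyzer(video_path).analyze()           # 3. transcript + Gemini audio read
    FrameAnalyzer(video_path).analyze()           # 4. per-frame GPT scoring
    HookAnalyzer(video_path).analyze()            # 5. hook alignment (needs step 3)
    return VideoReport(video_path).generate()     # 6. weighted final report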
ui/streamlit_app.py
CHANGED
@@ -33,12 +33,12 @@ if str(ROOT) not in sys.path:
     sys.path.insert(0, str(ROOT))
 
 # For debugging path issues
-print("sys.path:", sys.path)
-print("ROOT:", ROOT)
+# print("sys.path:", sys.path)
+# print("ROOT:", ROOT)
 
 from config import make_path
-from
-from
+from files.pipeline.scene_detect import SceneDetector
+from files.pipeline.frame_extract import FrameExtractor
 
 # -----------------------------
 # Storage layout
@@ -354,7 +354,7 @@ def _run_current_stage():
     elif stage == "frame analysis":
         push_status("Analyzing frames…")
         if st.session_state.openai_key and st.session_state.openai_key.strip():
-            from
+            from files.pipeline.frame_analysis import FrameAnalyzer
             try:
                 FrameAnalyzer(str(vp), openai_api_key=st.session_state.openai_key.strip()).analyze()
             except Exception as api_error:
@@ -380,7 +380,7 @@ def _run_current_stage():
     elif stage == "audio analysis":
         push_status("Analyzing audio…")
         if st.session_state.gemini_key and st.session_state.gemini_key.strip():
-            from
+            from files.pipeline.audio_analysis import AudioAnalyzer
             try:
                 AudioAnalyzer(str(vp), gemini_api_key=st.session_state.gemini_key.strip()).analyze()
             except (ValueError, Exception) as api_error:
@@ -406,7 +406,7 @@ def _run_current_stage():
     elif stage == "hook analysis":
         push_status("Evaluating hook…")
         if st.session_state.gemini_key and st.session_state.gemini_key.strip():
-            from
+            from files.pipeline.frame_analysis import HookAnalyzer
             try:
                 HookAnalyzer(str(vp), gemini_api_key=st.session_state.gemini_key.strip()).analyze()
             except (ValueError, Exception) as api_error:
@@ -432,7 +432,7 @@ def _run_current_stage():
     elif stage == "report":
         push_status("Generating final report…")
         if st.session_state.openai_key and st.session_state.openai_key.strip():
-            from
+            from files.pipeline.scoring import VideoReport
             try:
                 VideoReport(str(vp), openai_api_key=st.session_state.openai_key.strip()).generate()
             except Exception as api_error: