#!/usr/bin/env python3
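"""Classify human actions in a video with TimeSformer (Kinetics-400).

Uniformly samples frames from the input video (decord when available,
otherwise OpenCV), normalizes them, and classifies them with
facebook/timesformer-base-finetuned-k400.

Example (the script name is whatever this file is saved as):
    python predict_actions.py input.mp4 --top-k 5 --json
"""
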
import argparse
import json
import logging
import sys
from pathlib import Path
from typing import List, Tuple, Optional
import warnings

import numpy as np
from PIL import Image

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

try:
    import decord  # type: ignore
    _decord_error = None
except Exception as e:  # pragma: no cover
    _decord_error = e
    decord = None  # type: ignore

try:
    import cv2  # type: ignore
except Exception:  # pragma: no cover
    cv2 = None  # type: ignore

import torch
from transformers import AutoImageProcessor, TimesformerForVideoClassification

MODEL_ID = "facebook/timesformer-base-finetuned-k400"
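# TimeSformer expects pixel_values shaped (batch, num_frames, channels, height,
# width); this checkpoint uses 8 frames at 224x224 (read from model.config below).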

def fix_numpy_compatibility():
    """Check that NumPy supports the array operations used for video processing."""
    try:
        # Test basic numpy operations that are used in video processing
        test_array = np.array([1, 2, 3], dtype=np.float32)
        # Test stacking operations
        np.stack([test_array, test_array])

        # Test array creation and manipulation
        test_image_array = np.zeros((224, 224, 3), dtype=np.float32)
        test_video_array = np.stack([test_image_array, test_image_array], axis=0)

        # If we reach here, numpy is working
        logging.debug(f"NumPy {np.__version__} compatibility check passed")
        return True

    except Exception as e:
        logging.warning(f"NumPy compatibility issue: {e}")

        # For NumPy 2.x compatibility, try an alternative construction.
        # Rebuild the test data here: if the failure above happened before
        # test_array was assigned, referencing it would raise NameError.
        try:
            test_array = np.array([1.0, 2.0, 3.0], dtype=np.float32)
            np.array([test_array, test_array])
            logging.info("Using NumPy 2.x compatible operations")
            return True
        except Exception as e2:
            logging.error(f"NumPy compatibility cannot be resolved: {e2}")
            return False

def _read_video_frames_decord(video_path: Path, num_frames: int) -> List[Image.Image]:
    """Read video frames using decord library."""
    vr = decord.VideoReader(str(video_path))
    total = len(vr)

    if total == 0:
        raise RuntimeError(f"Video has no frames: {video_path}")

    # Handle edge case where video has fewer frames than requested
    actual_num_frames = min(num_frames, total)
    if actual_num_frames <= 0:
        raise RuntimeError(f"Invalid frame count: {actual_num_frames}")

    indices = np.linspace(0, total - 1, num=actual_num_frames, dtype=int).tolist()

    try:
        frames = vr.get_batch(indices).asnumpy()
        return [Image.fromarray(frame) for frame in frames]
    except Exception as e:
        logging.warning(f"Decord batch read failed: {e}")
        # Fallback to individual frame reading
        frames = []
        for idx in indices:
            try:
                frame = vr[idx].asnumpy()
                frames.append(Image.fromarray(frame))
            except Exception:
                continue
        return frames

def _read_video_frames_cv2(video_path: Path, num_frames: int) -> List[Image.Image]:
    """Read video frames using OpenCV."""
    if cv2 is None:
        raise RuntimeError("OpenCV (opencv-python) is required if decord is not installed.")

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise RuntimeError(f"Failed to open video: {video_path}")

    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total <= 0:
        # CAP_PROP_FRAME_COUNT can report 0 (or junk) for some containers
        cap.release()
        raise RuntimeError(f"Video has no frames: {video_path}")

    # Handle edge case where video has fewer frames than requested
    actual_num_frames = min(num_frames, total)
    if actual_num_frames <= 0:
        raise RuntimeError(f"Invalid frame count: {actual_num_frames}")

    indices = np.linspace(0, max(total - 1, 0), num=actual_num_frames, dtype=int).tolist()

    result: List[Image.Image] = []
    current_idx = 0
    frame_pos_set_ok = hasattr(cv2, "CAP_PROP_POS_FRAMES")

    for target in indices:
        try:
            if frame_pos_set_ok:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(target))
                ok, frame = cap.read()
                if not ok:
                    continue
            else:
                # Fallback: read sequentially until we reach the target index.
                # When linspace yields duplicate targets, the while body never
                # runs and the previously read frame is reused, which is the
                # correct frame for a duplicate.
                while current_idx <= target:
                    ok, frame = cap.read()
                    if not ok:
                        break
                    current_idx += 1
                if not ok:
                    continue

            # Convert BGR->RGB and to PIL
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            result.append(Image.fromarray(frame_rgb))
        except Exception as e:
            logging.warning(f"Error reading frame {target}: {e}")
            continue

    cap.release()
    return result

def _read_video_frames(video_path: Path, num_frames: int) -> List[Image.Image]:
    """Read uniformly sampled frames using decord if available, otherwise OpenCV."""
    frames = []
    last_error = None

    # Try decord first
    if decord is not None:
        try:
            frames = _read_video_frames_decord(video_path, num_frames)
            if frames:
                logging.debug(f"Successfully read {len(frames)} frames using decord")
                return frames
        except Exception as e:
            last_error = e
            logging.warning(f"Decord failed: {e}")

    # Fallback to OpenCV
    try:
        frames = _read_video_frames_cv2(video_path, num_frames)
        if frames:
            logging.debug(f"Successfully read {len(frames)} frames using OpenCV")
            return frames
    except Exception as e:
        last_error = e
        logging.warning(f"OpenCV failed: {e}")

    # Both backends failed (or returned no frames); surface the last error
    if last_error:
        raise RuntimeError(f"Failed to read video frames: {last_error}")
    raise RuntimeError(f"No frames could be read from: {video_path}")

def normalize_frames(frames: List[Image.Image], required_frames: int, target_size: Tuple[int, int] = (224, 224)) -> List[Image.Image]:
    """Normalize frames to required count and size."""
    if not frames:
        raise RuntimeError("No frames to normalize")

    # Adjust frame count
    original_count = len(frames)
    if len(frames) < required_frames:
        # Pad by repeating frames cyclically
        padding_needed = required_frames - len(frames)
        for i in range(padding_needed):
            frames.append(frames[i % original_count])
        logging.info(f"Padded frames from {original_count} to {required_frames}")
    elif len(frames) > required_frames:
        # Uniformly sample frames
        indices = np.linspace(0, len(frames) - 1, num=required_frames, dtype=int)
        frames = [frames[i] for i in indices]
        logging.info(f"Sampled {required_frames} frames from {original_count}")

    # Normalize frame properties
    normalized_frames = []
    for i, frame in enumerate(frames):
        try:
            # Ensure RGB mode
            if frame.mode != 'RGB':
                frame = frame.convert('RGB')

            # Resize to target size
            if frame.size != target_size:
                frame = frame.resize(target_size, Image.Resampling.LANCZOS)

            normalized_frames.append(frame)
        except Exception as e:
            logging.error(f"Error normalizing frame {i}: {e}")
            # Create a black frame as fallback
            black_frame = Image.new('RGB', target_size, (0, 0, 0))
            normalized_frames.append(black_frame)

    return normalized_frames

def create_tensor_from_frames(frames: List[Image.Image], processor=None) -> torch.Tensor:
    """Create tensor from frames using multiple fallback strategies."""

    # Strategy 1: Use processor if available and working
    if processor is not None:
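        # Different transformers releases expose different call signatures for
        # image/video processors, so several call forms are tried in turn
        # (an assumption about version variance, not a documented contract).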
        strategies = [
            lambda: processor(images=frames, return_tensors="pt"),
            lambda: processor(videos=frames, return_tensors="pt"),
            lambda: processor(frames, return_tensors="pt"),
        ]

        for i, strategy in enumerate(strategies, 1):
            try:
                inputs = strategy()
                if 'pixel_values' in inputs:
                    tensor = inputs['pixel_values']
                    logging.info(f"Strategy {i} succeeded, tensor shape: {tensor.shape}")
                    return tensor
            except Exception as e:
                logging.debug(f"Processor strategy {i} failed: {e}")
                continue

    # Strategy 2: Direct PyTorch tensor creation (bypass numpy compatibility issues)
    try:
        logging.info("Using direct PyTorch tensor creation")

        # Convert frames directly to PyTorch tensors
        frame_tensors = []
        for i, frame in enumerate(frames):
            # Ensure frame is in the right format
            if frame.mode != 'RGB':
                frame = frame.convert('RGB')
            if frame.size != (224, 224):
                frame = frame.resize((224, 224), Image.Resampling.LANCZOS)

            # Get pixel data and reshape properly
            pixels = list(frame.getdata())
            logging.debug(f"Frame {i}: got {len(pixels)} pixels")

            # Create tensor with shape (height, width, channels)
            pixel_tensor = torch.tensor(pixels, dtype=torch.float32).view(224, 224, 3)
            pixel_tensor = pixel_tensor / 255.0  # Normalize to [0, 1]
            logging.debug(f"Frame {i} tensor shape: {pixel_tensor.shape}")
            frame_tensors.append(pixel_tensor)

        # Stack frames into video tensor: (num_frames, height, width, channels)
        video_tensor = torch.stack(frame_tensors, dim=0)
        logging.debug(f"Stacked tensor shape: {video_tensor.shape}")

        # Rearrange dimensions for TimeSformer: (batch, num_frames, channels, height, width)
        # Current: (num_frames=8, height=224, width=224, channels=3)
        # Target:  (batch=1, num_frames=8, channels=3, height=224, width=224)
        video_tensor = video_tensor.permute(0, 3, 1, 2)  # (frames, height, width, channels) -> (frames, channels, height, width)
        logging.debug(f"After permute: {video_tensor.shape}")

        video_tensor = video_tensor.unsqueeze(0)  # (frames, channels, height, width) -> (1, frames, channels, height, width)
        logging.debug(f"After unsqueeze: {video_tensor.shape}")

        logging.info(f"Direct tensor creation succeeded, final shape: {video_tensor.shape}")
        return video_tensor

    except Exception as e:
        logging.debug(f"Direct tensor creation failed: {e}")

    # Strategy 3: Manual tensor creation with numpy fallback
    try:
        logging.info("Using numpy-based tensor creation")

        # Convert frames to numpy arrays
        frame_arrays = []
        for frame in frames:
            # Ensure frame is in the right format
            if frame.mode != 'RGB':
                frame = frame.convert('RGB')
            if frame.size != (224, 224):
                frame = frame.resize((224, 224), Image.Resampling.LANCZOS)

            # Convert to array and normalize
            frame_array = np.array(frame, dtype=np.float32)
            frame_array = frame_array / 255.0  # Normalize to [0, 1]
            frame_arrays.append(frame_array)

        # Stack frames: (num_frames, height, width, channels)
        try:
            video_array = np.stack(frame_arrays, axis=0)
        except Exception:
            # Fallback for compatibility issues
            video_array = np.array(frame_arrays)

        # Convert to PyTorch tensor
        video_tensor = torch.from_numpy(video_array)
        logging.debug(f"Numpy tensor initial shape: {video_tensor.shape}")

        # Rearrange dimensions for TimeSformer: (batch, num_frames, channels, height, width)
        # Current: (num_frames, height, width, channels)
        # Target:  (batch, num_frames, channels, height, width)
        video_tensor = video_tensor.permute(0, 3, 1, 2)  # (frames, height, width, channels) -> (frames, channels, height, width)
        video_tensor = video_tensor.unsqueeze(0)  # (frames, channels, height, width) -> (1, frames, channels, height, width)

        logging.info(f"Numpy tensor creation succeeded, shape: {video_tensor.shape}")
        return video_tensor

    except Exception as e:
        logging.debug(f"Numpy tensor creation failed: {e}")

    # Strategy 4: Pure Python fallback (slowest but most compatible)
    try:
        logging.info("Using pure Python tensor creation")

        # Convert frames to pure Python lists
        video_data = []
        for frame in frames:
            if frame.mode != 'RGB':
                frame = frame.convert('RGB')
            if frame.size != (224, 224):
                frame = frame.resize((224, 224), Image.Resampling.LANCZOS)

            # Get pixel data as list of RGB tuples
            pixels = list(frame.getdata())

            # Convert to 3D array structure: [height][width][channels]
            frame_data = []
            for row in range(224):
                row_data = []
                for col in range(224):
                    pixel_idx = row * 224 + col
                    r, g, b = pixels[pixel_idx]
                    row_data.append([r/255.0, g/255.0, b/255.0])  # Normalize
                frame_data.append(row_data)
            video_data.append(frame_data)

        # Convert to tensor
        video_tensor = torch.tensor(video_data, dtype=torch.float32)
        logging.debug(f"Pure Python tensor initial shape: {video_tensor.shape}")

        # Rearrange dimensions: (frames, height, width, channels) -> (batch, frames, channels, height, width)
        video_tensor = video_tensor.permute(0, 3, 1, 2)  # (frames, height, width, channels) -> (frames, channels, height, width)
        video_tensor = video_tensor.unsqueeze(0)  # (frames, channels, height, width) -> (1, frames, channels, height, width)

        logging.info(f"Pure Python tensor creation succeeded, shape: {video_tensor.shape}")
        return video_tensor

    except Exception as e:
        raise RuntimeError(f"All tensor creation strategies failed. Last error: {e}")

def load_model(device: Optional[str] = None):
    """Load the TimeSformer model and processor."""
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    try:
        logging.info("Loading TimeSformer model...")
        processor = AutoImageProcessor.from_pretrained(MODEL_ID)
        model = TimesformerForVideoClassification.from_pretrained(MODEL_ID)
        model.to(device)
        model.eval()
        logging.info(f"Model loaded successfully on {device}")
        return processor, model, device
    except Exception as e:
        logging.error(f"Failed to load model: {e}")
        raise RuntimeError(f"Model loading failed: {e}")

def predict_actions(video_path: str, top_k: int = 5) -> List[Tuple[str, float]]:
    """Run inference on a video and return top-k (label, score)."""

    # Check numpy compatibility first
    if not fix_numpy_compatibility():
        logging.warning("NumPy compatibility issues detected, but continuing with fallbacks")
        # Don't fail completely - try to continue with available functionality

    try:
        processor, model, device = load_model()
        required_frames = int(getattr(model.config, "num_frames", 8))

        logging.info(f"Processing video: {video_path}")
        logging.info(f"Required frames: {required_frames}")

        # Read video frames
        frames = _read_video_frames(Path(video_path), num_frames=required_frames)
        if not frames:
            raise RuntimeError("Could not extract any frames from the video")

        logging.info(f"Extracted {len(frames)} frames")

        # Normalize frames
        frames = normalize_frames(frames, required_frames)
        logging.info(f"Normalized to {len(frames)} frames")

        # Create tensor
        pixel_values = create_tensor_from_frames(frames, processor)

        # Move to device
        pixel_values = pixel_values.to(device)

        # Run inference
        logging.info("Running inference...")
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values)
            logits = outputs.logits

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)[0]

            # Get top-k predictions
            scores, indices = torch.topk(probs, k=top_k)

            # Convert to labels
            results = []
            for score, idx in zip(scores.cpu(), indices.cpu()):
                label = model.config.id2label[idx.item()]
                results.append((label, float(score)))

            logging.info("Prediction completed successfully")
            return results

    except Exception as e:
        logging.error(f"Prediction failed: {e}")
        raise RuntimeError(f"Video processing error: {e}")

def main():
    """Command line interface."""
    parser = argparse.ArgumentParser(description="Predict actions in a video using TimeSformer")
    parser.add_argument("video", type=str, help="Path to input video file")
    parser.add_argument("--top-k", type=int, default=5, help="Top-k predictions to show")
    parser.add_argument("--json", action="store_true", help="Output JSON instead of text")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        preds = predict_actions(args.video, top_k=args.top_k)

        if args.json:
            print(json.dumps([{"label": label, "score": score} for label, score in preds], indent=2))
        else:
            print(f"\nTop {len(preds)} predictions for: {args.video}")
            print("-" * 50)
            for i, (label, score) in enumerate(preds, 1):
                print(f"{i:2d}. {label:<30} ({score:.3f})")

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0

if __name__ == "__main__":
    sys.exit(main())