精确与近似寻址模式：性能与准确性对比

精确与近似寻址模式：性能与准确性对比#

本示例介绍 :class:`torchcodec.decoders.VideoDecoder` 的 seek_mode 参数。该参数在解码器创建速度与帧寻址准确性之间做权衡（例如在近似模式下，请求第 i 帧不一定返回第 i 帧）。

准备：下载短视频并生成长视频#

我们从网络下载一个约 14 秒的短视频，并用 ffmpeg 将其循环 100 次，得到一个约 23 分钟的长视频。

import torch
import httpx
import tempfile
from pathlib import Path
import shutil
import subprocess
from time import perf_counter_ns

# Video source: https://www.pexels.com/video/dog-eating-854132/
# License: CC0. Author: Coverr.
url = "https://videos.pexels.com/video-files/854132/854132-sd_640_360_25fps.mp4"
# NOTE(review): an empty User-Agent is sent explicitly -- presumably the host
# rejects the HTTP client's default one; confirm before changing.
headers = {"User-Agent": ""}

# Both the downloaded clip and the generated long video live in one scratch
# directory.
temp_dir = tempfile.mkdtemp()
short_video_path = Path(temp_dir) / "short_video.mp4"

# Stream the download to disk instead of holding the whole body in memory.
with httpx.stream("GET", url, headers=headers, follow_redirects=True) as r:
    if r.status_code != 200:
        raise RuntimeError(f"Failed to download video. status_code = {r.status_code}.")
    with short_video_path.open("wb") as out_file:
        # Empty chunks are skipped; everything else is written in arrival order.
        out_file.writelines(chunk for chunk in r.iter_bytes() if chunk)

# Build the long video by looping the short one with ffmpeg. "-c copy" copies
# the streams without re-encoding.
long_video_path = Path(temp_dir) / "long_video.mp4"
ffmpeg_command = [
    "ffmpeg",
    "-stream_loop", "99",  # 99 extra loops => the clip is played 100 times
    "-i", str(short_video_path),
    "-c", "copy",
    str(long_video_path),
]
# check=True raises if ffmpeg exits with a non-zero status; its stdout/stderr
# are captured rather than echoed.
subprocess.run(ffmpeg_command, check=True, capture_output=True)

from torchcodec.decoders import VideoDecoder
# Report the duration of each video as read from the decoder's metadata:
# seconds for the short clip, minutes for the looped one.
short_duration_seconds = VideoDecoder(short_video_path).metadata.duration_seconds
print(f"短视频时长: {short_duration_seconds} 秒")
long_duration_minutes = VideoDecoder(long_video_path).metadata.duration_seconds / 60
print(f"长视频时长: {long_duration_minutes} 分钟")

短视频时长: 13.8 秒
长视频时长: 23.0 分钟

性能：解码器创建耗时#

seek_mode 最直接影响的是 :class:`torchcodec.decoders.VideoDecoder` 的创建耗时；视频越长，近似模式的收益越明显。

def bench(f, average_over=50, warmup=2, **f_kwargs):
    """Time repeated calls of ``f(**f_kwargs)`` and print a one-line summary.

    Args:
        f: Callable to benchmark.
        average_over: Number of timed calls.
        warmup: Number of untimed calls made before timing starts.
        **f_kwargs: Keyword arguments forwarded to every call of ``f``.

    Prints the median and the standard deviation of the timed calls, both in
    milliseconds. Returns nothing.
    """

    def timed_call_ns():
        # Only the call to ``f`` itself sits between the two clock reads.
        began = perf_counter_ns()
        f(**f_kwargs)
        finished = perf_counter_ns()
        return finished - began

    # Untimed warm-up calls.
    for _ in range(warmup):
        f(**f_kwargs)

    durations_ns = []
    for _ in range(average_over):
        durations_ns.append(timed_call_ns())

    # Nanoseconds -> milliseconds.
    durations_ms = torch.tensor(durations_ns) * 1e-6
    # ``med`` must keep this exact name: the ``{med = ...}`` form below prints
    # the variable name as part of the output.
    med = durations_ms.median().item()
    std = durations_ms.std().item()
    print(f"{med = :.2f}ms +- {std:.2f}")

# Decoder-creation benchmarks, grouped per video. Each entry is
# (banner to print, video path, seek_mode passed to VideoDecoder).
creation_benchmarks = (
    (
        ("在短视频上创建 seek_mode='exact' 的解码器:", short_video_path, "exact"),
        ("在短视频上创建 seek_mode='approximate' 的解码器:", short_video_path, "approximate"),
    ),
    (
        ("在长视频上创建 seek_mode='exact' 的解码器:", long_video_path, "exact"),
        ("在长视频上创建 seek_mode='approximate' 的解码器:", long_video_path, "approximate"),
    ),
)
for group_index, group in enumerate(creation_benchmarks):
    if group_index:
        # Blank line separating the short-video and long-video results.
        print()
    for banner, video_path, mode in group:
        print(banner)
        bench(VideoDecoder, source=video_path, seek_mode=mode)

在短视频上创建 seek_mode='exact' 的解码器:
med = 4.58ms +- 0.44
在短视频上创建 seek_mode='approximate' 的解码器:
med = 4.13ms +- 0.45

在长视频上创建 seek_mode='exact' 的解码器:
med = 49.23ms +- 2.61
在长视频上创建 seek_mode='approximate' 的解码器:
med = 5.31ms +- 0.86

性能：帧解码与片段采样#

严格来说，seek_mode 只影响解码器创建本身；并不直接影响解码或采样。但实际流程往往为每个视频先创建解码器，因此它会间接影响总体耗时。

from torchcodec import samplers

def sample_clips(seek_mode):
    """Create a decoder for the long video and sample clips from it.

    A new ``VideoDecoder`` is built on every call, so the cost of decoder
    creation under ``seek_mode`` is part of whatever a caller measures.

    Args:
        seek_mode: Value forwarded to ``VideoDecoder(seek_mode=...)``.

    Returns:
        The result of ``samplers.clips_at_random_indices`` for 5 clips of
        2 frames each.
    """
    long_video_decoder = VideoDecoder(source=long_video_path, seek_mode=seek_mode)
    return samplers.clips_at_random_indices(
        decoder=long_video_decoder,
        num_clips=5,
        num_frames_per_clip=2,
    )

# Benchmark clip sampling (decoder creation included) once per seek mode.
sampling_benchmarks = (
    ("使用 seek_mode='exact' 进行片段采样:", "exact"),
    ("使用 seek_mode='approximate' 进行片段采样:", "approximate"),
)
for banner, mode in sampling_benchmarks:
    print(banner)
    bench(sample_clips, seek_mode=mode)

使用 seek_mode='exact' 进行片段采样:
med = 131.16ms +- 16.07
使用 seek_mode='approximate' 进行片段采样:
med = 88.62ms +- 23.35

准确性：元数据与帧获取#

seek_mode="approximate" 能显著加速创建，但代价是寻址不如精确模式准确，也可能影响元数据的精确性。很多情况下两者没有差异，此时近似模式是 "净收益"。

# Show the metadata reported for the short video under each seek mode.
print("短视频元数据（exact）:")
exact_metadata = VideoDecoder(short_video_path, seek_mode="exact").metadata
print(exact_metadata)
print("短视频元数据（approximate）:")
approx_metadata = VideoDecoder(short_video_path, seek_mode="approximate").metadata
print(approx_metadata)

# Fetch every frame index with both decoders and require the results to match.
exact_decoder = VideoDecoder(short_video_path, seek_mode="exact")
approx_decoder = VideoDecoder(short_video_path, seek_mode="approximate")
frame_count = len(exact_decoder)
for frame_index in range(frame_count):
    exact_frame = exact_decoder.get_frame_at(frame_index).data
    approx_frame = approx_decoder.get_frame_at(frame_index).data
    # Zero absolute and relative tolerance: the frames must be exactly equal.
    torch.testing.assert_close(exact_frame, approx_frame, atol=0, rtol=0)
print("该视频上，两种模式的帧寻址一致！")

原理简述#

当 seek_mode="exact" 时，解码器在初始化阶段会进行一次 "扫描"：不解码整段文件，但处理整个文件以获得更精确的元数据（如时长），并构建帧与关键帧的内部索引。该索引可能比文件头中的更准确，从而提升寻址准确性。若不扫描，TorchCodec 仅依赖文件自身元数据，其准确性可能不佳。

选择建议#

若非常在意帧寻址的严格精确性，使用 exact。
若可以为了速度牺牲部分寻址精度（如片段采样场景），使用 approximate。
若视频无可变帧率且元数据正确，approximate 通常与 exact 一样准确但更快。

清理临时资源#

# Remove the scratch directory created with tempfile.mkdtemp(), together with
# the short and long videos that were written into it.
shutil.rmtree(temp_dir)