# InternVideo ECVA tuned head
- Base backbone: `revliter/internvideo_next_large_p14_res224_f16`
- Clip length: 16 frames
- Frame size: 224x224
- Head hidden dims: [512]
- Repo: `happy8825/internvideo_tuned`
## Quick start (single video)

```bash
pip install decord transformers huggingface_hub
python inference_example.py --repo_id happy8825/internvideo_tuned --video /path/to/video.mp4 --device cuda
```
The script downloads this repo, loads the InternVideo backbone plus the tuned head, and prints `normal` or `abnormal`.
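The example above targets CUDA. Assuming the script's `--device` flag accepts standard torch device strings (an assumption; check `inference_example.py` to confirm), a CPU-only machine would run:

```bash
python inference_example.py --repo_id happy8825/internvideo_tuned --video /path/to/video.mp4 --device cpu
```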
## Minimal Python snippet
```python
import json
import os

import numpy as np
import torch
from decord import VideoReader
from huggingface_hub import snapshot_download
from transformers import AutoModel, VideoMAEImageProcessor

ID2LABEL = {0: "normal", 1: "abnormal"}


class ClassificationHead(torch.nn.Module):
    """MLP head: (Linear -> GELU -> Dropout) per hidden dim, then a final Linear."""

    def __init__(self, in_dim, hidden_dims, num_labels=2, dropout=0.1):
        super().__init__()
        dims = [in_dim] + list(hidden_dims)
        layers = []
        for i in range(len(dims) - 1):
            layers += [torch.nn.Linear(dims[i], dims[i + 1]), torch.nn.GELU(), torch.nn.Dropout(dropout)]
        layers.append(torch.nn.Linear(dims[-1], num_labels))
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


def pool_tokens(feats, expected=None):
    """Mean-pool (batch, d1, d2) features over the token axis.

    If the expected feature dim is known, pool over the other axis; otherwise
    assume the larger axis is the token axis and the smaller is the feature dim.
    """
    if feats.dim() != 3:
        return feats
    _, d1, d2 = feats.shape
    if expected:
        if d1 == expected:
            return feats.mean(dim=2)
        if d2 == expected:
            return feats.mean(dim=1)
    return feats.mean(dim=2 if d1 <= d2 else 1)


# Download this repo and read the training config.
repo = "happy8825/internvideo_tuned"
local = snapshot_download(repo)
with open(os.path.join(local, "train_config.json")) as f:
    cfg = json.load(f)
base = cfg.get("base_model", "revliter/internvideo_next_large_p14_res224_f16")
clip_len = int(cfg.get("clip_len", 16))
hidden = cfg.get("hidden", [512])
feat_dim = cfg.get("feature_dim") or cfg.get("hidden_size")

# Load the backbone and the tuned classification head.
processor = VideoMAEImageProcessor.from_pretrained(base)
backbone = AutoModel.from_pretrained(base, trust_remote_code=True).eval().to("cuda")
head = ClassificationHead(in_dim=feat_dim or backbone.config.hidden_size, hidden_dims=hidden)
state = torch.load(os.path.join(local, "best_head.pt"), map_location="cpu")
head.load_state_dict(state["head"])
head.eval().to("cuda")

# Sample clip_len frames uniformly across the video.
vr = VideoReader("/path/to/video.mp4")
idxs = np.linspace(0, len(vr) - 1, num=clip_len, dtype=int)
frames = [vr[i].asnumpy() for i in idxs]

# The processor returns (batch, frames, channels, H, W); the backbone expects
# (batch, channels, frames, H, W), hence the permute.
px = processor(frames, return_tensors="pt")["pixel_values"].permute(0, 2, 1, 3, 4).to("cuda")

with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
    feats = backbone.extract_features(pixel_values=px)
    pooled = pool_tokens(feats, expected=feat_dim)
    pred = int(head(pooled.float()).argmax(dim=-1).item())
print(ID2LABEL.get(pred, pred))
```
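To get a confidence score rather than just the argmax label, you can softmax the head's logits. This is a small extension of the snippet above (reusing its `head`, `pooled`, and `ID2LABEL`), not part of the shipped script:

```python
import torch.nn.functional as F

with torch.no_grad():
    logits = head(pooled.float())          # shape (1, 2)
    probs = F.softmax(logits, dim=-1)[0]   # per-class probabilities
for idx, label in ID2LABEL.items():
    print(f"{label}: {probs[idx].item():.3f}")
```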
## Files

- `best_head.pt`: classifier head weights
- `train_config.json`: training config (base model, clip_len, frame_size, hidden dims, etc.); an illustrative sketch follows below
- `inference_example.py`: minimal inference helper
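For reference, here is a hypothetical sketch of the keys the snippet above reads from `train_config.json`, with values mirroring the settings listed at the top of this card; the shipped file is the source of truth:

```python
# Hypothetical train_config.json contents (illustrative only):
example_cfg = {
    "base_model": "revliter/internvideo_next_large_p14_res224_f16",
    "clip_len": 16,
    "frame_size": 224,
    "hidden": [512],
    # "feature_dim" / "hidden_size" are optional; when both are absent the
    # snippet falls back to backbone.config.hidden_size.
}
```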