# from .custom_layers import TransformerEncoder, PositionalEmbedding
from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
from huggingface_hub import from_pretrained_keras
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import imageio
import cv2

# model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})
model = from_pretrained_keras("keras-io/video-transformers")
| """ | |
| Below code is taken from the Video-Transformers example on keras-io by Sayak Paul | |
| """ | |
def build_feature_extractor():
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()
def crop_center(frame):
    # Center-crop a single frame to IMG_SIZE x IMG_SIZE.
    center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped
def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]  # BGR (OpenCV) -> RGB
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)
def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos with all-zero frames.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate((frames, padding))

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                frame_features[i, j, :] = 0.0  # Keep a zero vector for padded frames.

    return frame_features
def predict_action(path):
    frames = load_video(path)
    frame_features = prepare_single_video(frames)
    probabilities = model.predict(frame_features)[0]

    confidences = {}
    for i in np.argsort(probabilities)[::-1]:
        confidences[CLASS_VOCAB[i]] = float(probabilities[i])

    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
    print(confidences)
    return confidences, gif_out
def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return "animation.gif"