|
|
|
|
|
|
|
|
<!doctype html> |
|
|
<html lang="en"> |
|
|
|
|
|
|
|
|
<head>
  <!-- Document metadata. -->
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <meta name="description" content="FineCVR." />
  <meta name="author" content="FineCVR" />
  <meta name="keywords" content="Fine-grained retrieval, dataset, decomposition method" />

  <title>FineCVR</title>

  <!-- NOTE(review): the canonical URL is the site root, while in-page links use FineCVR.html; confirm this is intended. -->
  <link rel="canonical" href="https://may2333.github.io/" />

  <!-- Bootstrap 5.3.3 stylesheet (integrity-checked), then the site stylesheet. -->
  <link
    rel="stylesheet"
    href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/5.3.3/css/bootstrap.min.css"
    integrity="sha512-jnSuA4Ss2PkkikSOLtYs8BlYIeeIK1h99ty4YfvRPAlzr377vr3CXDb7sb7eEEBYjDtcYj+AjBH3FLv5uSJuXg=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  />
  <link href="FineCVR/css/grayscale.css" rel="stylesheet" />

  <!-- Web font and icon font. "defer" is not a valid attribute on link elements, so it is not used here. -->
  <link rel="preconnect" href="https://fonts.googleapis.com" />
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
  <link
    href="https://fonts.googleapis.com/css2?family=Maven+Pro:wght@400..900&display=swap"
    rel="stylesheet"
  />
  <link
    href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css"
    rel="stylesheet"
    integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA=="
    crossorigin="anonymous"
  />

  <!-- Cookie consent banner: load the library, then start it once the DOM is parsed. -->
  <script
    type="text/javascript"
    src="https://www.termsfeed.com/public/cookie-consent/4.1.0/cookie-consent.js"
    charset="UTF-8"
  ></script>
  <script type="text/javascript">
    document.addEventListener("DOMContentLoaded", function () {
      cookieconsent.run({
        notice_banner_type: "simple",
        consent_type: "express",
        palette: "light",
        language: "en",
        page_load_consent_levels: ["strictly-necessary"],
        notice_banner_reject_button_hide: false,
        preferences_center_close_button_hide: false,
        page_refresh_confirmation_buttons: false,
        website_name: "https://may2333.github.io/FineCVR/",
      });
    });
  </script>

  <!--
    Analytics. These scripts use type="text/plain", so the browser does not execute them itself;
    presumably cookie-consent.js activates them once the "tracking" level is granted (confirm against the library docs).
    NOTE(review): "UA-XXXXXXXX-X" is a placeholder measurement id.
  -->
  <script
    type="text/plain"
    data-cookie-consent="tracking"
    async
    src="https://www.googletagmanager.com/gtag/js?id=UA-XXXXXXXX-X"
  ></script>
  <script type="text/plain" data-cookie-consent="tracking">
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());
    gtag('config', 'UA-XXXXXXXX-X');
  </script>
</head>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<body |
|
|
id="page-top" |
|
|
data-spy="scroll" |
|
|
data-target=".navbar-fixed-top" |
|
|
data-offset="151" |
|
|
> |
|
|
|
|
|
|
|
|
<!-- Fixed top navigation bar: brand link, mobile toggler, and in-page section links. -->
<nav class="navbar navbar-expand-lg navbar-custom fixed-top" style="background-color: #249ebd;">
  <!-- Forces white text for the bar, brand, links, and navbar text. -->
  <style>
    .navbar-custom,
    .navbar-custom .navbar-brand,
    .navbar-custom .nav-link,
    .navbar-custom .navbar-text {
      color: white !important;
    }
  </style>
  <div class="container">
    <!-- The accessible name comes from the visible text "FineCVR"; no aria-label, so it is not announced as a collapse control. -->
    <a class="navbar-brand page-scroll" href="FineCVR.html#page-top">
      <div>FineCVR</div>
    </a>
    <!-- Bootstrap collapse toggler for #navbarNav. -->
    <button
      class="navbar-toggler"
      type="button"
      data-bs-toggle="collapse"
      data-bs-target="#navbarNav"
      aria-controls="navbarNav"
      aria-expanded="false"
      aria-label="Toggle navigation"
    >
      <i class="fa fa-bars" aria-hidden="true"></i>
    </button>
    <div class="collapse navbar-collapse" id="navbarNav">
      <ul class="navbar-nav ms-auto">
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#abstract">Abstract</a>
        </li>
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#approach">Approach</a>
        </li>
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#dataset">Dataset</a>
        </li>
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#bibtex">BibTeX</a>
        </li>
      </ul>
    </div>
  </div>
</nav>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!-- Intro banner: animated paper title, author list, and quick-access buttons. -->
<header class="intro">
  <div class="wrapper">
    <!-- Left empty on purpose: the title text is typed into this span by the typed.js init script. -->
    <span class="intro-text" style="font-size: 58px; font-weight: bolder;"></span>
  </div>
  <div>
    <span style="font-size: 32px;">Yue Wu, Zhaobo Qi, Yiling Wu, Junshu Sun, Yaowei Wang, Shuhui Wang</span>
  </div>
  <!-- This link has no visible content; the aria-label gives it a name if it receives keyboard focus. -->
  <a href="FineCVR.html#abstract" class="page-scroll" aria-label="Scroll to abstract"></a>
  <div class="button-container" style="display:flex;justify-content: center; align-items:center;">
    <!-- NOTE(review): the Paper button has no click action; wire it to the paper URL once available. -->
    <button class="button" type="button">
      <i class="fa fa-file-pdf" aria-hidden="true"></i> Paper
    </button>
    <!-- External link: "noopener,noreferrer" keeps the opened page from reaching window.opener. -->
    <button class="button" type="button" onclick="window.open('https://github.com/May2333/FDCA', '_blank', 'noopener,noreferrer')">
      <i class="fab fa-github" aria-hidden="true"></i> Code
    </button>
    <button class="button" type="button" onclick="window.location.href='#dataset'">
      <i class="fa fa-database" aria-hidden="true"></i> Dataset
    </button>
    <button class="button" type="button" onclick="window.location.href='#bibtex'">
      <i class="fas fa-file-alt" aria-hidden="true"></i> BibTeX
    </button>
  </div>
</header>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!-- Abstract: paper summary shown in a white, justified text panel. -->
<section class="container content-section text-center" id="abstract">
  <div class="row justify-content-center">
    <div class="col-12">
      <h2>Abstract</h2>
      <div style="text-align: justify; margin-left: auto; margin-right: auto; background-color: white;padding: 50px;">
        With the explosive growth of video data, finding videos that meet detailed requirements in large datasets has become a challenge. To address this, the composed video retrieval task has been introduced, enabling users to retrieve videos using complex queries that involve both visual and textual information.
        However, the inherent heterogeneity between modalities poses significant challenges. Textual data is highly abstract, while video content contains substantial redundancy. This modality gap in information representation makes existing methods struggle with the fine-grained fusion and alignment required for fine-grained composed retrieval.
        To overcome these challenges, we introduce <strong>FineCVR-1M</strong>, a fine-grained composed video retrieval dataset containing 1,010,071 video-text triplets with detailed textual descriptions. This dataset is constructed through an automated process that identifies key concept changes between video pairs to generate textual descriptions for both static and action concepts.
        For fine-grained retrieval methods, the key challenge lies in understanding the detailed requirements. Text descriptions serve as clear expressions of intent, allowing models to distinguish fine-grained needs through textual feature disentanglement.
        Therefore, we propose a textual Feature Disentanglement and Cross-modal Alignment framework <strong>FDCA</strong> that disentangles features at both the sentence and token levels. At the sequence level, we separate text features into retained and injected features.
        At the token level, an Auxiliary Token Disentangling mechanism is proposed to disentangle texts into retained, injected, and excluded tokens. The disentanglement at both levels extracts fine-grained features, which are aligned and fused with reference video to extract global representations for video retrieval.
        Experiments on FineCVR-1M dataset demonstrate the superior performance of FDCA.
      </div>
    </div>
  </div>
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!-- Approach: pipeline figure with a short description. -->
<section id="approach" class="container content-section text-center">
  <div class="row justify-content-center">
    <div class="col-12">
      <!-- The section element owns id="approach"; the heading carries no id, so the id stays unique in the document. -->
      <h2>Approach</h2>
      <div style="background-color: white;">
        <img src="FineCVR/img/pipeline.svg" alt="Overall pipeline of the FDCA framework" width="800" height="800" />
        <div style="text-align: justify; margin-left: auto; margin-right: auto;padding: 50px;">
          Overall pipeline of FDCA involves fine-grained cross-modal alignment and fusion through the disentangling of text features. We further enhance this process by introducing token-level disentangling, where clustering is used to generate three types of features, enabling the model to focus on fine-grained information.
        </div>
      </div>
    </div>
  </div>
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!-- Dataset: download buttons plus an example figure with its explanation. -->
<section id="dataset" class="container content-section text-center">
  <div class="row justify-content-center">
    <div class="col-12">
      <!-- The section element owns id="dataset"; the heading carries no id, so the id stays unique in the document. -->
      <h2>Dataset</h2>
      <div style="background-color: white;padding: 50px;">
        <div style="text-align: justify; margin-left: auto; margin-right: auto;">
          You can download the FineCVR-1M dataset from the following links:
          <br />
          <!-- External downloads open in a new tab; "noopener,noreferrer" keeps them from reaching window.opener. -->
          <div class="button-container" style="display:flex;justify-content: center; align-items:center;">
            <button class="button" type="button" onclick="window.open('https://drive.google.com/file/d/1-2JVLN8i06IB30ub8v-iqo3jI9u5V_2Q/view?usp=drive_link', '_blank', 'noopener,noreferrer')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> CLIP Embeddings
            </button>
            <button class="button" type="button" onclick="window.open('https://drive.google.com/file/d/17ZI4bQg0CTxwuW0-BbvuekCuhhjTxikP/view?usp=drive_link', '_blank', 'noopener,noreferrer')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> BLIP Embeddings
            </button>
            <button class="button" type="button" onclick="window.open('https://drive.google.com/drive/folders/1SneQu9pUhvWmehGxn_Y8YB0JGaa-XfAv?usp=drive_link', '_blank', 'noopener,noreferrer')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> Annotations
            </button>
            <button class="button" type="button" onclick="window.open('https://pan.baidu.com/s/1uSRsdYeOhBVnPLj04cv3ZA?pwd=mwyq', '_blank', 'noopener,noreferrer')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> Frames
            </button>
          </div>
        </div>

        <br />
        <div style="display:flex;justify-content: center; align-items:center;">
          Example tuples in FineCVR-1M dataset.
        </div>

        <img
          src="FineCVR/img/good_case.svg"
          alt="Example tuples from the FineCVR-1M dataset: reference videos on the left, target videos on the right, and modification texts below"
          width="1000"
          height="800"
        />

        <div style="text-align: justify; margin-left: auto; margin-right: auto;">
          The videos on the left represent reference videos,
          while those on the right represent target videos. The modification texts at the bottom highlight the differences between the two videos.
        </div>
      </div>
    </div>
  </div>
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!-- BibTeX: citation entry for the paper. -->
<section id="bibtex" class="container content-section text-center">
  <div class="row justify-content-center">
    <div class="col-12">
      <!-- The section element owns id="bibtex"; the heading carries no id, so the id stays unique in the document. -->
      <h2>BibTeX</h2>
      <div style="text-align: justify; margin-left: auto; margin-right: auto; padding: 50px;background-color: white;">
        If you use our dataset or method in your research, please cite our paper:

        <!-- BibTeX separates authors with "and"; a comma-separated list is not parsed as multiple authors. -->
<pre>
@article{yue25finecvr,
  title = {Learning Fine-Grained Representations through Textual Token Disentanglement in Composed Video Retrieval},
  author = {Yue Wu and Zhaobo Qi and Yiling Wu and Junshu Sun and Yaowei Wang and Shuhui Wang},
  journal = {ICLR},
  year = {2025}
}
</pre>

      </div>
    </div>
  </div>
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!-- Page footer: centered copyright line. -->
<footer>
  <div class="container text-center pt-5">
    <p class="mb-0">Copyright © FineCVR 2025</p>
  </div>
</footer>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!--
  Third-party libraries, loaded in dependency order: jQuery first, then the Bootstrap bundle,
  then the jQuery easing plugin (the "easeInOutExpo" easing is requested by the page-scroll script).
  All three are integrity-checked and send no referrer, matching the Bootstrap stylesheet tag.
-->
<script
  src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.12.4/jquery.min.js"
  integrity="sha512-jGsMH83oKe9asCpkOVkBnUrDDTp8wl+adkB2D+//JtlxO4SrLoJdhbOysIFQJloQFD+C4Fl1rMsQZF76JjV0eQ=="
  crossorigin="anonymous"
  referrerpolicy="no-referrer"
></script>

<script
  src="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/5.3.3/js/bootstrap.bundle.min.js"
  integrity="sha512-7Pi/otdlbbCR+LnW+F7PwFcSDJOuUJB3OxtEHbg4vSMvzvJjde4Po1v4BR9Gdc9aXNUNFVUY+SK51wWT8WF0Gg=="
  crossorigin="anonymous"
  referrerpolicy="no-referrer"
></script>

<script
  src="https://cdnjs.cloudflare.com/ajax/libs/jquery-easing/1.3/jquery.easing.min.js"
  integrity="sha512-ahmSZKApTDNd3gVuqL5TQ3MBTj8tL5p2tYV05Xxzcfu6/ecvt1A0j6tfudSGBVuteSoTRMqMljbfdU0g2eDNUA=="
  crossorigin="anonymous"
  referrerpolicy="no-referrer"
></script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<script>
  /**
   * Adds the "top-nav-collapse" class to the fixed navbar when the navbar's
   * document offset is greater than 50px, and removes it otherwise.
   */
  function toggleNavCollapse() {
    var pastThreshold = $(".navbar").offset().top > 50;
    $(".fixed-top").toggleClass("top-nav-collapse", pastThreshold);
  }

  $(document).ready(toggleNavCollapse);
  $(window).scroll(toggleNavCollapse);

  // Smooth-scroll for every link marked with the "page-scroll" class.
  $(function () {
    $("a.page-scroll").on("click", function (event) {
      var link = $(this);

      // The hrefs look like "FineCVR.html#abstract". Passing that whole string
      // to $() treats it as a CSS selector that matches nothing, so only the
      // fragment is used, and the target is looked up by id.
      var fragment = this.hash;
      var target = fragment
        ? $(document.getElementById(fragment.slice(1)))
        : $();

      // No matching element on this page: leave the click to the browser.
      if (!target.length) {
        return;
      }

      event.preventDefault();
      $("html, body")
        .stop()
        .animate(
          { scrollTop: target.offset().top - 50 },
          100,
          "easeInOutExpo",
          function () {
            link.blur();
          }
        );
    });
  });

  // Clicking a link inside the collapsed menu clicks the toggler if it is visible.
  $(".navbar-collapse ul li a").click(function () {
    $(".navbar-toggler:visible").click();
  });
</script>
|
|
|
|
|
|
|
|
|
|
|
<script>
  // Clicking the brand link hides the #navbarNav collapse when it is currently shown.
  document.querySelector(".navbar-brand").addEventListener("click", function () {
    const menu = document.getElementById("navbarNav");

    // Nothing to do while the menu is closed.
    if (!menu.classList.contains("show")) {
      return;
    }

    // Reuse the existing Bootstrap Collapse instance if there is one; otherwise create it.
    let collapse = bootstrap.Collapse.getInstance(menu);
    if (!collapse) {
      collapse = new bootstrap.Collapse(menu);
    }
    collapse.hide();
  });
</script>
|
|
|
|
|
|
|
|
|
|
|
<script>
  // Hides the open navbar menu when a click lands anywhere outside of it.
  document.addEventListener("click", function (event) {
    const menu = document.querySelector(".navbar-collapse");
    const clickedOutside = !menu.contains(event.target);
    const menuIsOpen = menu.classList.contains("show");

    if (!(clickedOutside && menuIsOpen)) {
      return;
    }

    // Reuse the existing Bootstrap Collapse instance if there is one; otherwise create it.
    let collapse = bootstrap.Collapse.getInstance(menu);
    if (!collapse) {
      collapse = new bootstrap.Collapse(menu);
    }
    collapse.hide();
  });
</script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<!-- typed.js (jQuery plugin) and its setup: types the paper title into the .intro-text span. -->
<script
  src="https://cdnjs.cloudflare.com/ajax/libs/typed.js/1.1.7/typed.min.js"
  type="text/javascript"
></script>
<script type="text/javascript">
  $(".intro-text").typed({
    // Text to type; the embedded <br > splits the title over two lines.
    strings: ["Learning Fine-Grained Representations through <br >Textual Token Disentanglement in Composed Video Retrieval"],

    // Timing.
    startDelay: 1000,
    typeSpeed: 3,

    // Run once, without looping.
    loop: false,
    loopCount: 1,

    // Cursor is configured but hidden.
    showCursor: false,
    cursorChar: "|",
  });
</script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript">
  // Injects the Disqus comment-count script for the configured shortname.
  // NOTE(review): "personal-jekyll-theme" looks like a template default; confirm this loader is still wanted.
  var disqus_shortname = "personal-jekyll-theme";

  (function () {
    var s = document.createElement("script");
    s.async = true;
    s.type = "text/javascript";
    // Explicit https instead of a protocol-relative URL, so the request is
    // always made over TLS, including when the page is opened from file://.
    s.src = "https://" + disqus_shortname + ".disqus.com/count.js";
    (
      document.getElementsByTagName("HEAD")[0] ||
      document.getElementsByTagName("BODY")[0]
    ).appendChild(s);
  })();
</script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</body> |
|
|
</html> |
|
|
|
|
|
|
|
|
|