Mqleet's picture
[update] templates
a3d3755
raw
history blame
17.1 kB
<!-- Index Layout Start -->
<!doctype html>
<html lang="en">
<!-- Head Start -->
<head>
  <!-- Document metadata, stylesheets, fonts, cookie-consent banner and consent-gated analytics. -->
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <meta
    name="description"
    content="FineCVR-1M dataset and FDCA framework for fine-grained composed video retrieval."
  />
  <meta name="author" content="FineCVR" />
  <meta name="keywords" content="Fine-grained retrieval, dataset, decomposition method" />
  <title>FineCVR</title>
  <!-- NOTE(review): the canonical URL is the site root, while the consent banner below names
       https://may2333.github.io/FineCVR/ as the website. Confirm which URL is intended. -->
  <link rel="canonical" href="https://may2333.github.io/" />
  <link
    rel="stylesheet"
    href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/5.3.3/css/bootstrap.min.css"
    integrity="sha512-jnSuA4Ss2PkkikSOLtYs8BlYIeeIK1h99ty4YfvRPAlzr377vr3CXDb7sb7eEEBYjDtcYj+AjBH3FLv5uSJuXg=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  />
  <link href="FineCVR/css/grayscale.css" rel="stylesheet" />
  <!-- Fonts -->
  <link rel="preconnect" href="https://fonts.googleapis.com" />
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
  <!-- "defer" is not an attribute of <link>, so it was dropped; "&" is escaped inside the URL. -->
  <link
    href="https://fonts.googleapis.com/css2?family=Maven+Pro:wght@400..900&amp;display=swap"
    rel="stylesheet"
  />
  <link
    href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.1/css/all.min.css"
    rel="stylesheet"
    integrity="sha512-DTOQO9RWCH3ppGqcWaEA1BIZOC6xxalwEsw9c2QQeAIftl+Vegovlnee1c9QX4TctnWMn13TZye+giMm8e2LwA=="
    crossorigin="anonymous"
  />
  <!-- Cookie Consent by TermsFeed https://www.TermsFeed.com -->
  <script src="https://www.termsfeed.com/public/cookie-consent/4.1.0/cookie-consent.js"></script>
  <script>
    // Start the consent banner once the DOM has been parsed.
    document.addEventListener("DOMContentLoaded", function () {
      cookieconsent.run({
        notice_banner_type: "simple",
        consent_type: "express",
        palette: "light",
        language: "en",
        page_load_consent_levels: ["strictly-necessary"],
        notice_banner_reject_button_hide: false,
        preferences_center_close_button_hide: false,
        page_refresh_confirmation_buttons: false,
        website_name: "https://may2333.github.io/FineCVR/",
      });
    });
  </script>
  <!-- Google Analytics -->
  <!-- Both tags use type="text/plain", so the browser does not run them by itself; presumably the
       consent library activates scripts tagged data-cookie-consent="tracking" after opt-in.
       NOTE(review): "UA-XXXXXXXX-X" is a placeholder ID. Replace it with a real ID or remove
       both tags. -->
  <script
    type="text/plain"
    data-cookie-consent="tracking"
    async
    src="https://www.googletagmanager.com/gtag/js?id=UA-XXXXXXXX-X"
  ></script>
  <script type="text/plain" data-cookie-consent="tracking">
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());
    gtag('config', 'UA-XXXXXXXX-X');
  </script>
</head>
<!-- Head End -->
<!-- Scrollspy uses the data-bs-* attribute names of the Bootstrap 5 bundle loaded by this page,
     and targets the #navbarNav menu that holds the section links. -->
<body
  id="page-top"
  data-bs-spy="scroll"
  data-bs-target="#navbarNav"
  data-bs-offset="151"
>
<!-- Navigation Start -->
<nav class="navbar navbar-expand-lg navbar-custom fixed-top" style="background-color: #249ebd;">
  <style>
    /* Force white text for the brand, links and plain text inside the custom navbar. */
    .navbar-custom,
    .navbar-custom .navbar-brand,
    .navbar-custom .nav-link,
    .navbar-custom .navbar-text {
      color: white !important;
    }
  </style>
  <div class="container">
    <!-- Brand link back to the top of the page; its visible text is its accessible name. -->
    <a class="navbar-brand page-scroll" href="FineCVR.html#page-top">
      <div>FineCVR</div>
    </a>
    <!-- Toggler that expands and collapses the #navbarNav menu. -->
    <button
      class="navbar-toggler"
      type="button"
      data-bs-toggle="collapse"
      data-bs-target="#navbarNav"
      aria-controls="navbarNav"
      aria-expanded="false"
      aria-label="Toggle navigation"
    >
      <i class="fa fa-bars" aria-hidden="true"></i>
    </button>
    <div class="collapse navbar-collapse" id="navbarNav">
      <ul class="navbar-nav ms-auto">
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#abstract">Abstract</a>
        </li>
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#approach">Approach</a>
        </li>
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#dataset">Dataset</a>
        </li>
        <li class="nav-item pe-2 ps-2">
          <a class="nav-link page-scroll" href="FineCVR.html#bibtex">BibTeX</a>
        </li>
      </ul>
    </div>
  </div>
</nav>
<!-- Navigation End -->
<!-- Intro Start -->
<header class="intro">
  <!-- Title banner: the .intro-text span is filled at runtime by the typed.js call in this page. -->
  <div class="wrapper">
    <span class="intro-text" style="font-size: 58px; font-weight: bolder;"><!--
      If you want to have a static message in your intro layout,
      disable the dynamic-typing in the _config.yml and write here your text.
    -->
    </span>
  </div>
  <div>
    <span style="font-size: 32px;">Yue Wu, Zhaobo Qi, Yiling Wu, Junshu Sun, Yaowei Wang, Shuhui Wang</span>
  </div>
  <!-- Placeholder link kept from the template (its image is disabled). It has no content, so it
       is removed from the tab order and hidden from assistive technology. -->
  <a href="FineCVR.html#abstract" class="page-scroll" tabindex="-1" aria-hidden="true"></a>
  <div class="button-container" style="display:flex;justify-content: center; align-items:center;">
    <!-- NOTE(review): this button has no click handler and no paper URL appears in this file;
         wire it to the paper link once it is known. -->
    <button class="button" type="button">
      <i class="fa fa-file-pdf" aria-hidden="true"></i> Paper
    </button>
    <!-- External site: opened with "noopener" so the new page gets no handle on this window. -->
    <button class="button" type="button" onclick="window.open('https://github.com/May2333/FDCA', '_blank', 'noopener')">
      <i class="fab fa-github" aria-hidden="true"></i> Code
    </button>
    <button class="button" type="button" onclick="window.location.href='#dataset'">
      <i class="fa fa-database" aria-hidden="true"></i> Dataset
    </button>
    <button class="button" type="button" onclick="window.location.href='#bibtex'">
      <i class="fas fa-file-alt" aria-hidden="true"></i> BibTeX
    </button>
  </div>
</header>
<!-- Intro End -->
<!-- Abstract Start -->
<section class="container content-section text-center" id="abstract">
  <!-- Abstract: heading followed by one justified block of text on a white card. -->
  <div class="row justify-content-center">
    <div class="col-12">
      <h2>Abstract</h2>
      <div style="text-align: justify; margin-left: auto; margin-right: auto; background-color: white;padding: 50px;">
        With the explosive growth of video data, finding videos that meet detailed requirements in large datasets has become a challenge. To address this, the composed video retrieval task has been introduced, enabling users to retrieve videos using complex queries that involve both visual and textual information.
        However, the inherent heterogeneity between modalities poses significant challenges. Textual data is highly abstract, while video content contains substantial redundancy. This modality gap in information representation makes existing methods struggle with the fine-grained fusion and alignment required for fine-grained composed retrieval.
        To overcome these challenges, we introduce <strong>FineCVR-1M</strong>, a fine-grained composed video retrieval dataset containing 1,010,071 video-text triplets with detailed textual descriptions. This dataset is constructed through an automated process that identifies key concept changes between video pairs to generate textual descriptions for both static and action concepts.
        For fine-grained retrieval methods, the key challenge lies in understanding the detailed requirements. Text descriptions serve as clear expressions of intent, allowing models to distinguish fine-grained needs through textual feature disentanglement.
        Therefore, we propose a textual Feature Disentanglement and Cross-modal Alignment framework <strong>FDCA</strong> that disentangles features at both the sentence and token levels. At the sequence level, we separate text features into retained and injected features.
        At the token level, an Auxiliary Token Disentangling mechanism is proposed to disentangle texts into retained, injected, and excluded tokens. The disentanglement at both levels extracts fine-grained features, which are aligned and fused with reference video to extract global representations for video retrieval.
        Experiments on FineCVR-1M dataset demonstrate the superior performance of FDCA.
      </div>
    </div>
  </div>
</section>
<!-- Abstract End -->
<!-- Approach Start -->
<section
  id="approach"
  class="container content-section text-center"
>
  <div class="row justify-content-center">
    <div class="col-12">
      <!-- The enclosing section already carries id="approach"; ids must be unique per document,
           so the heading has none. -->
      <h2>Approach</h2>
      <div style="background-color: white;">
        &nbsp;
        <img src="FineCVR/img/pipeline.svg" alt="Diagram of the overall FDCA pipeline" width="800" height="800" />
        <div style="text-align: justify; margin-left: auto; margin-right: auto;padding: 50px;">
          Overall pipeline of FDCA involves fine-grained cross-modal alignment and fusion through the disentangling of text features. We further enhance this process by introducing token-level disentangling, where clustering is used to generate three types of features, enabling the model to focus on fine-grained information.
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Approach End -->
<!-- Dataset Start -->
<section
  id="dataset"
  class="container content-section text-center"
>
  <div class="row justify-content-center">
    <div class="col-12">
      <!-- The enclosing section already carries id="dataset"; ids must be unique per document,
           so the heading has none. -->
      <h2>Dataset</h2>
      <div style="background-color: white;padding: 50px;">
        <div style="text-align: justify; margin-left: auto; margin-right: auto;">
          You can download the FineCVR-1M dataset from the following links:
          <br />&nbsp;
          <!-- Download buttons open external hosts; "noopener" keeps the opened page from
               getting a handle on this window. -->
          <div class="button-container" style="display:flex;justify-content: center; align-items:center;">
            <button class="button" type="button" onclick="window.open('https://drive.google.com/file/d/1-2JVLN8i06IB30ub8v-iqo3jI9u5V_2Q/view?usp=drive_link', '_blank', 'noopener')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> CLIP Embeddings
            </button>
            <button class="button" type="button" onclick="window.open('https://drive.google.com/file/d/17ZI4bQg0CTxwuW0-BbvuekCuhhjTxikP/view?usp=drive_link', '_blank', 'noopener')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> BLIP Embeddings
            </button>
            <button class="button" type="button" onclick="window.open('https://drive.google.com/drive/folders/1SneQu9pUhvWmehGxn_Y8YB0JGaa-XfAv?usp=drive_link', '_blank', 'noopener')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> Annotations
            </button>
            <button class="button" type="button" onclick="window.open('https://pan.baidu.com/s/1uSRsdYeOhBVnPLj04cv3ZA?pwd=mwyq', '_blank', 'noopener')">
              <i class="fa fa-cloud-download" aria-hidden="true"></i> Frames
            </button>
          </div>
        </div>
        <br />
        <div style="display:flex;justify-content: center; align-items:center;">
          Example tuples in FineCVR-1M dataset.
        </div>
        <img src="FineCVR/img/good_case.svg" alt="Example tuples from the FineCVR-1M dataset" width="1000" height="800" />
        <div style="text-align: justify; margin-left: auto; margin-right: auto;">
          The videos on the left represent reference videos,
          while those on the right represent target videos. The modification texts at the bottom highlight the differences between the two videos.
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Dataset End -->
<!-- BibTeX Start -->
<section
  id="bibtex"
  class="container content-section text-center"
>
  <div class="row justify-content-center">
    <div class="col-12">
      <!-- The enclosing section already carries id="bibtex"; ids must be unique per document,
           so the heading has none. -->
      <h2>BibTeX</h2>
      <div style="text-align: justify; margin-left: auto; margin-right: auto; padding: 50px;background-color: white;">
        If you use our dataset or method in your research, please cite our paper:
        <!-- BibTeX separates authors with "and"; a comma-separated list is not parsed as six
             authors. The entry is typed as a conference paper. Content inside <pre> stays
             flush-left because its whitespace is rendered verbatim. -->
<pre>
@inproceedings{yue25finecvr,
  title     = {Learning Fine-Grained Representations through Textual Token Disentanglement in Composed Video Retrieval},
  author    = {Yue Wu and Zhaobo Qi and Yiling Wu and Junshu Sun and Yaowei Wang and Shuhui Wang},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2025}
}
</pre>
      </div>
    </div>
  </div>
</section>
<!-- BibTeX End -->
<!-- Footer Start -->
<footer>
  <!-- Page footer: a single centered copyright line. -->
  <div class="container text-center pt-5">
    <p class="mb-0">Copyright &copy; FineCVR 2025</p>
  </div>
</footer>
<!-- Footer End -->
<!-- Javascript Start -->
<!-- Third-party libraries, loaded in dependency order: jQuery first, because the inline scripts
     and the easing plugin below use it. All three tags carry SRI hashes and the same
     referrer policy.
     NOTE(review): jQuery 1.12.4 is an end-of-life release. Upgrading needs a new integrity
     hash and a check that the typed.js and jquery.easing plugins used here still work. -->
<script
  src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.12.4/jquery.min.js"
  integrity="sha512-jGsMH83oKe9asCpkOVkBnUrDDTp8wl+adkB2D+//JtlxO4SrLoJdhbOysIFQJloQFD+C4Fl1rMsQZF76JjV0eQ=="
  crossorigin="anonymous"
  referrerpolicy="no-referrer"
></script>
<script
  src="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/5.3.3/js/bootstrap.bundle.min.js"
  integrity="sha512-7Pi/otdlbbCR+LnW+F7PwFcSDJOuUJB3OxtEHbg4vSMvzvJjde4Po1v4BR9Gdc9aXNUNFVUY+SK51wWT8WF0Gg=="
  crossorigin="anonymous"
  referrerpolicy="no-referrer"
></script>
<script
  src="https://cdnjs.cloudflare.com/ajax/libs/jquery-easing/1.3/jquery.easing.min.js"
  integrity="sha512-ahmSZKApTDNd3gVuqL5TQ3MBTj8tL5p2tYV05Xxzcfu6/ecvt1A0j6tfudSGBVuteSoTRMqMljbfdU0g2eDNUA=="
  crossorigin="anonymous"
  referrerpolicy="no-referrer"
></script>
<!--
* Start Bootstrap - Grayscale Bootstrap Theme (http://startbootstrap.com)
* Code licensed under the Apache License v2.0.
* For details, see http://www.apache.org/licenses/LICENSE-2.0.
-->
<script>
  // Adds "top-nav-collapse" to the fixed navbar while the navbar's document offset is above
  // 50px, and removes it otherwise. Runs on DOM ready and on every window scroll event.
  function toggleNavCollapse() {
    if ($(".navbar").offset().top > 50) {
      $(".fixed-top").addClass("top-nav-collapse");
    } else {
      $(".fixed-top").removeClass("top-nav-collapse");
    }
  }
  $(document).ready(toggleNavCollapse);
  $(window).scroll(toggleNavCollapse);

  // Smooth-scroll for links marked "page-scroll".
  $(function () {
    $("a.page-scroll").bind("click", function (event) {
      var link = $(this);
      // The hrefs look like "FineCVR.html#abstract", which is not a usable selector, so the
      // target is resolved from the fragment part of the link only.
      var fragment = this.hash;
      var target = fragment
        ? $(document.getElementById(fragment.slice(1)))
        : $();
      if (!target.length) {
        // No matching element in this document: leave the browser's default navigation alone.
        return;
      }
      $("html, body")
        .stop()
        .animate(
          // Stop 50px above the target.
          { scrollTop: target.offset().top - 50 },
          100,
          "easeInOutExpo",
          function () {
            link.blur();
          }
        );
      event.preventDefault();
    });
  });

  // Clicking a link inside the collapsible menu clicks the toggler when the toggler is visible.
  $(".navbar-collapse ul li a").click(function () {
    $(".navbar-toggler:visible").click();
  });
</script>
<!-- Collapse navbar when navbar-brand is clicked -->
<script>
  // Clicking the brand link hides the #navbarNav menu when that menu is currently shown.
  (function () {
    var brandLink = document.querySelector(".navbar-brand");
    brandLink.addEventListener("click", function () {
      var menu = document.getElementById("navbarNav");
      if (!menu.classList.contains("show")) {
        return;
      }
      // Reuse the existing Collapse instance if there is one, otherwise create it.
      var collapse = bootstrap.Collapse.getInstance(menu);
      if (!collapse) {
        collapse = new bootstrap.Collapse(menu);
      }
      collapse.hide();
    });
  })();
</script>
<!-- Collapse navbar when clicked outside of it -->
<script>
  // Any click that lands outside the .navbar-collapse element hides it when it is shown.
  document.addEventListener("click", function (evt) {
    var menu = document.querySelector(".navbar-collapse");
    var clickedInside = menu.contains(evt.target);
    var menuShown = menu.classList.contains("show");
    if (clickedInside || !menuShown) {
      return;
    }
    // Reuse the existing Collapse instance if there is one, otherwise create it.
    var collapse = bootstrap.Collapse.getInstance(menu);
    if (!collapse) {
      collapse = new bootstrap.Collapse(menu);
    }
    collapse.hide();
  });
</script>
<!-- Dynamic Typing Start -->
<script
  src="https://cdnjs.cloudflare.com/ajax/libs/typed.js/1.1.7/typed.min.js"
  type="text/javascript"
></script>
<script type="text/javascript">
  // Hands the paper title to the typed.js jQuery plugin for every element with class "intro-text".
  (function ($introText) {
    $introText.typed({
      strings: ["Learning Fine-Grained Representations through <br >Textual Token Disentanglement in Composed Video Retrieval"],
      typeSpeed: 3,
      startDelay: 1000,
      loop: false,
      loopCount: 1,
      cursorChar: "|",
      showCursor: false,
    });
  })($(".intro-text"));
</script>
<!-- Dynamic Typing End -->
<!-- Comments Counter Start -->
<!-- Injects the Disqus comment-count script for the shortname below.
     Marked the same way as the Google Analytics tags in this file (type="text/plain" plus
     data-cookie-consent), so this third-party script is not requested by the browser on load;
     presumably the consent library activates it after opt-in.
     NOTE(review): "personal-jekyll-theme" looks like a template default rather than this site's
     own shortname, and no markup in this file refers to Disqus. Confirm whether this loader is
     needed at all. -->
<script type="text/plain" data-cookie-consent="tracking">
  var disqus_shortname = "personal-jekyll-theme";
  (function () {
    var s = document.createElement("script");
    s.async = true;
    s.type = "text/javascript";
    // Explicit https scheme instead of a protocol-relative URL.
    s.src = "https://" + disqus_shortname + ".disqus.com/count.js";
    (
      document.getElementsByTagName("HEAD")[0] ||
      document.getElementsByTagName("BODY")[0]
    ).appendChild(s);
  })();
</script>
<!-- Comments Counter End -->
<!-- Javascript End -->
</body>
</html>
<!-- Index Layout End -->