Spaces:
Running
Running
laurent
committed on
Commit
·
fe98679
1
Parent(s):
874d788
Add some content.
Browse files- README.md +3 -4
- helper.js +209 -0
- index.html +249 -17
README.md
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
---
|
| 2 |
title: Hibiki Samples
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
sdk: static
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
-
|
| 10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: Hibiki Samples
|
| 3 |
+
emoji: 🤗
|
| 4 |
colorFrom: green
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: static
|
| 7 |
+
app_file: index.html
|
| 8 |
pinned: false
|
| 9 |
---
|
|
|
|
|
|
helper.js
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Build the markup for an inline audio player pointing at a WAV file.
 *
 * @param {string} path - URL or relative path of the audio file.
 * @returns {string} HTML for an <audio> element (download control hidden)
 *   wrapping a single <source> of type audio/wav.
 */
function createAudioHTML(path) {
  // The src value is quoted and HTML-escaped: an unquoted attribute value is
  // cut at the first space and may not legally contain `=`, quotes or angle
  // brackets (some sample directories are named like `hibiki_cfg=1`).
  const src = String(path)
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  return '<audio controls controlslist="nodownload" class="px-1"> <source src="' +
    src +
    '" type="audio/wav">Your browser does not support the audio element.</audio>';
}
|
| 6 |
+
|
| 7 |
+
/**
 * Fill consecutive cells of one table row, one cell per entry of `dirs`.
 *
 * For each directory the path `base_dir/dir/filename` is built. Paths ending
 * in `txt` are fetched with XMLHttpRequest and their text is written into the
 * cell once the request is done; any other path is appended to the cell as an
 * audio player via createAudioHTML.
 *
 * @param {HTMLTableRowElement} table_row - Row whose `cells` are filled.
 * @param {string} base_dir - Common directory prefix of the samples.
 * @param {string[]} dirs - Sub-directory names, one per column.
 * @param {string} filename - File name shared by all columns.
 * @param {number} col_offset - Index of the first cell to fill.
 */
function generateExampleRow(table_row, base_dir, dirs, filename, col_offset) {
  for (let i = 0; i < dirs.length; i++) {
    const cell = table_row.cells[col_offset + i];
    const p = base_dir + '/' + dirs[i] + '/' + filename;
    if (p.endsWith('txt')) {
      // Block-scoped on purpose: with a function-scoped `var`, every callback
      // would read the response of the LAST request created by this loop.
      const req = new XMLHttpRequest();
      req.onreadystatechange = function() {
        if (req.readyState === req.DONE) {
          cell.innerHTML = '<font size="-1">' + req.responseText + '</font>';
        }
      };
      req.open('GET', p);
      req.send(null);
    } else {
      cell.innerHTML = cell.innerHTML + createAudioHTML(p);
    }
  }
}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
/**
 * Populate the CVSS-C sample table with audio players.
 *
 * Row 0 of the table is skipped (rows are addressed from index 1); each
 * following row receives one file rendered for the `source`, `hibiki` and
 * `seamless` directories under `data/cvss_c_test`.
 *
 * @param {string} tableId - id of the <table> element to fill.
 */
function generateCVSS(tableId) {
  const table = document.getElementById(tableId);
  if (!table) {
    // An uncaught TypeError here would stop the remaining top-level code of
    // this script, so a missing table is reported and skipped instead.
    console.warn('generateCVSS: no element with id "' + tableId + '"');
    return;
  }
  const base_dir = 'data/cvss_c_test';
  const dirs = ['source', 'hibiki', 'seamless'];
  const filenames = [
    "cvss-fr2en-test-idx14345-20007437.wav",
    "cvss-fr2en-test-idx14410-20011543.wav",
    "cvss-fr2en-test-idx14603-20030929.wav",
    "cvss-fr2en-test-idx14695-20041791.wav",
    "cvss-fr2en-test-idx4562-19004869.wav",
  ];

  filenames.forEach((filename, i) => {
    const row = table.rows[1 + i];
    // Tolerate a table that declares fewer body rows than there are samples.
    if (row) {
      generateExampleRow(row, base_dir, dirs, filename, 0);
    }
  });
}
|
| 43 |
+
|
| 44 |
+
/**
 * Populate the long-form (audio NTREX) sample table with audio players.
 *
 * Row 0 of the table is skipped (rows are addressed from index 1); each
 * following row receives one file rendered for the `source`, `hibiki` and
 * `seamless` directories under `data/audio_ntrex_long`.
 *
 * @param {string} tableId - id of the <table> element to fill.
 */
function generateNTREX(tableId) {
  const table = document.getElementById(tableId);
  if (!table) {
    // An uncaught TypeError here would stop the remaining top-level code of
    // this script, so a missing table is reported and skipped instead.
    console.warn('generateNTREX: no element with id "' + tableId + '"');
    return;
  }
  const base_dir = 'data/audio_ntrex_long';
  const dirs = ['source', 'hibiki', 'seamless'];
  const filenames = [
    "10887_ea80c8e6-883d-4afe-841b-598ce7db3779.wav",
    "3120_a63eabfc-d5aa-4353-84d0-9c5c068a1b38.wav",
    "5196_ea80c8e6-883d-4afe-841b-598ce7db3779.wav",
    "6855_f3c3ea82-42ef-4c09-b4aa-544a4c95518b.wav",
    "9605_83f1360e-7775-4d36-89f6-60649041c935.wav",
  ];

  filenames.forEach((filename, i) => {
    const row = table.rows[1 + i];
    // Tolerate a table that declares fewer body rows than there are samples.
    if (row) {
      generateExampleRow(row, base_dir, dirs, filename, 0);
    }
  });
}
|
| 60 |
+
|
| 61 |
+
/**
 * Populate the VoxPopuli (classifier-free guidance comparison) table with
 * audio players.
 *
 * Row 0 of the table is skipped (rows are addressed from index 1); each
 * following row receives one file rendered for five directories under
 * `data/voxpopuli`: the source, Hibiki at three cfg settings, and Seamless.
 *
 * @param {string} tableId - id of the <table> element to fill.
 */
function generateVoxPopuli(tableId) {
  const table = document.getElementById(tableId);
  if (!table) {
    // An uncaught TypeError here would stop the remaining top-level code of
    // this script, so a missing table is reported and skipped instead.
    console.warn('generateVoxPopuli: no element with id "' + tableId + '"');
    return;
  }
  const base_dir = 'data/voxpopuli';
  const dirs = ['source', 'hibiki_cfg=1', 'hibiki_cfg=3', 'hibiki_cfg=10', 'seamless'];
  const filenames = [
    "20090422-0900-PLENARY-3_20090422-09:53:50_7.wav",
    "20090506-0900-PLENARY-12_20090506-17:43:49_4.wav",
    "20090914-0900-PLENARY-15_20090914-20:43:54_7.wav",
    "20090916-0900-PLENARY-4_20090916-10:55:02_12.wav",
  ];

  filenames.forEach((filename, i) => {
    const row = table.rows[1 + i];
    // Tolerate a table that declares fewer body rows than there are samples.
    if (row) {
      generateExampleRow(row, base_dir, dirs, filename, 0);
    }
  });
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
// Fill the three sample tables; each call looks its table up by element id.
generateNTREX('ntrex-table');
|
| 79 |
+
generateCVSS('cvss-table');
|
| 80 |
+
generateVoxPopuli('voxpopuli-table');
|
| 81 |
+
|
| 82 |
+
// Borrowed from https://nu-dialogue.github.io/j-moshi/
|
| 83 |
+
/**
 * Once the DOM is ready, build the two waveform tables (#vis-table and
 * #vis-table2): a header row, one cell per audio file holding a waveform
 * container and a play/pause button, and a WaveSurfer instance per cell.
 */
$(document).ready(function() {
  /**
   * Build one waveform table.
   *
   * @param {jQuery} table - Target <table>, wrapped by jQuery.
   * @param {string[]} columns - Header titles.
   * @param {string[][]} urlRows - Audio URLs, one inner array per table row.
   * @param {string} waveformPrefix - id prefix of the waveform containers.
   * @param {string} buttonPrefix - id prefix of the play/pause buttons. It
   *   must differ between tables so that element ids stay unique in the page.
   * @param {Object} extraOptions - Additional WaveSurfer.create options.
   */
  const buildWaveformTable = (table, columns, urlRows, waveformPrefix, buttonPrefix, extraOptions) => {
    // Add header
    const headerRow = $('<tr>');
    columns.forEach((title) => {
      headerRow.append($('<th style="text-align: center">').text(title));
    });
    table.append($('<thead>').append(headerRow));

    // Add rows
    const tbody = $('<tbody>');
    urlRows.forEach((urls, i) => {
      const row = $('<tr>');
      urls.forEach((_url, j) => {
        const waveCell = $('<td style="text-align: center">');
        waveCell.append($('<div>').attr('id', `${waveformPrefix}-${i}-${j}`));
        waveCell.append(`
          <button class="btn btn-secondary" data-action="play" id="${buttonPrefix}-${i}-${j}">
            <i class="bi bi-play-fill"></i> Play / <i class="bi bi-pause-fill"></i> Pause
          </button>
        `);
        row.append(waveCell);
      });
      tbody.append(row);
    });
    table.append(tbody);

    // Create wavesurfer instances only after the cells are attached, so the
    // `#id` selectors below can match them.
    urlRows.forEach((urls, i) => {
      urls.forEach((url, j) => {
        const wavesurfer = WaveSurfer.create(Object.assign({
          container: `#${waveformPrefix}-${i}-${j}`,
          url: url,
          barWidth: 2,
          height: 55,
        }, extraOptions));
        $(`#${buttonPrefix}-${i}-${j}`).click(() => {
          wavesurfer.playPause();
        });
      });
    });
  };

  // Stereo samples: the two channels are drawn separately in their own colors.
  {
    const columns = ['Hibiki', 'Seamless'];
    const rows = [
      ['data-stereo/hibiki1.wav', 'data-stereo/seamless1.wav'],
      ['data-stereo/hibiki2.wav', 'data-stereo/seamless2.wav'],
      ['data-stereo/hibiki3.wav', 'data-stereo/seamless3.wav'],
    ];
    buildWaveformTable($('#vis-table'), columns, rows, 'waveform', 'play-pause', {
      splitChannels: [
        {
          waveColor: '#2E7D9E',
          progressColor: '#173E4E',
        },
        {
          waveColor: '#E57872',
          progressColor: '#2A0908',
        },
      ],
      width: 700,
    });
  }

  // VoxPopuli samples: one column per directory, one row per file.
  {
    const columns = ['Real Human Interpretation', 'Hibiki', 'Seamless'];
    const dirs = [
      "data/voxpopuli/gt_with_fr_background",
      "data/voxpopuli/hibiki_cfg=3_with_fr_background",
      "data/voxpopuli/seamless_with_fr_background",
    ];
    const files = [
      "20090422-0900-PLENARY-3_20090422-09:53:50_7.wav",
      "20090506-0900-PLENARY-12_20090506-17:43:49_4.wav",
      "20090914-0900-PLENARY-15_20090914-20:43:54_7.wav",
      "20090916-0900-PLENARY-4_20090916-10:55:02_12.wav",
    ];
    const rows = files.map((file) => dirs.map((dir) => dir + '/' + file));
    // Button ids use their own `play-pause2` prefix: reusing `play-pause`
    // here would duplicate the ids of the first table's buttons.
    buildWaveformTable($('#vis-table2'), columns, rows, 'waveform2', 'play-pause2', {});
  }
});
|
index.html
CHANGED
|
@@ -1,19 +1,251 @@
|
|
| 1 |
-
<!
|
| 2 |
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
</html>
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<title>Hibiki</title>
|
| 5 |
+
<link
|
| 6 |
+
href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css"
|
| 7 |
+
rel="stylesheet"
|
| 8 |
+
/>
|
| 9 |
+
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.min.css">
|
| 10 |
+
<meta charset="utf-8" />
|
| 11 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 12 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
|
| 13 |
+
<script src="https://unpkg.com/wavesurfer.js@7"></script>
|
| 14 |
+
<script src="helper.js" defer></script>
|
| 15 |
+
<script>
|
| 16 |
+
/**
 * Make playback exclusive for one media element: whenever `elem` fires its
 * "play" event, every other element of `elems` is paused.
 *
 * @param {HTMLMediaElement} elem - Element whose "play" event is observed.
 * @param {Iterable<HTMLMediaElement>} elems - All elements in the group,
 *   iterated each time the event fires.
 */
function _setup_callback(elem, elems) {
  elem.addEventListener("play", function () {
    // The loop variable is declared locally: an undeclared `other` creates a
    // global in sloppy mode and throws a ReferenceError in strict mode.
    for (const other of elems) {
      if (other !== elem) {
        other.pause();
      }
    }
  });
}
|
| 25 |
+
|
| 26 |
+
// When the document has been parsed, register every <audio> element found in
// the body so that starting one pauses all the others.
document.addEventListener('DOMContentLoaded', function () {
  const elems = document.body.getElementsByTagName("audio");
  // Declared with `const`: an undeclared loop variable would leak a global.
  for (const elem of elems) {
    _setup_callback(elem, elems);
  }
});
|
| 32 |
+
</script>
|
| 33 |
+
<style>
|
| 34 |
+
td {
|
| 35 |
+
vertical-align: middle;
|
| 36 |
+
text-align: center;
|
| 37 |
+
}
|
| 38 |
+
audio {
|
| 39 |
+
width: 20vw;
|
| 40 |
+
min-width: 100px;
|
| 41 |
+
max-width: 100%;
|
| 42 |
+
}
|
| 43 |
+
h1, h2, h3, h4, h5, h6, body, b, strong, th {
|
| 44 |
+
color: #595959;
|
| 45 |
+
}
|
| 46 |
+
.ratio-8x5 {
|
| 47 |
+
--bs-aspect-ratio: 62.5%;
|
| 48 |
+
}
|
| 49 |
+
.btn-secondary {
|
| 50 |
+
padding: 0.1rem 0.8rem;
|
| 51 |
+
font-size: small
|
| 52 |
+
}
|
| 53 |
+
.container {
|
| 54 |
+
max-width: 1620px;
|
| 55 |
+
}
|
| 56 |
+
</style>
|
| 57 |
+
</head>
|
| 58 |
+
<body>
|
| 59 |
+
<div class="container pt-5 mt-5 shadow p-5 mb-5 bg-white rounded">
|
| 60 |
+
<div class="text-center">
|
| 61 |
+
<h1>High-Fidelity Simultaneous Speech-To-Speech Translation</h1>
|
| 62 |
+
<p class="lead">
|
| 63 |
+
<a href="https://kyutai.org">Kyutai</a>
|
| 64 |
+
- code on <a href="https://github.com/kyutai-labs/hibiki">github</a>
|
| 65 |
+
</p>
|
| 66 |
+
</div>
|
| 67 |
+
<p>
|
| 68 |
+
<b>Abstract.</b>
|
| 69 |
+
We introduce <i>Hibiki</i> ('echo' in Japanese), a decoder-only model for simultaneous speech translation.
|
| 70 |
+
Hibiki leverages a multistream language model to synchronously process
|
| 71 |
+
source and target speech, and jointly produces text and audio tokens to
|
| 72 |
+
perform speech-to-text and speech-to-speech translation.
|
| 73 |
+
We furthermore address the fundamental challenge of <i>simultaneous</i> interpretation,
|
| 74 |
+
which unlike its <i>consecutive</i> counterpart---where one waits for
|
| 75 |
+
the end of the source utterance to start translating--- adapts its flow
|
| 76 |
+
to accumulate just enough context to produce a correct translation in
|
| 77 |
+
real-time, chunk by chunk. <br />
|
| 78 |
+
To do so, we introduce a weakly-supervised method that leverages the
|
| 79 |
+
perplexity of an off-the-shelf text translation system to identify
|
| 80 |
+
optimal delays on a per-word basis and create aligned synthetic data.
|
| 81 |
+
After supervised training, Hibiki performs adaptive, simultaneous
|
| 82 |
+
speech translation with vanilla temperature sampling. On a
|
| 83 |
+
French-English simultaneous speech translation task, Hibiki demonstrates
|
| 84 |
+
state-of-the-art performance in translation quality, speaker fidelity
|
| 85 |
+
and naturalness. Moreover, the simplicity of its inference process
|
| 86 |
+
makes it compatible with batched translation and even real-time
|
| 87 |
+
on-device deployment.
|
| 88 |
+
</p>
|
| 89 |
+
</div>
|
| 90 |
+
|
| 91 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 92 |
+
<h3>In the Wild Examples<a id="vis"/></h3>
|
| 93 |
+
<p class="mb-0">
|
| 94 |
+
</p>
|
| 95 |
+
<div class="container pt-3 table-responsive">
|
| 96 |
+
<table class="table table-hover" width="100%">
|
| 97 |
+
<tr>
|
| 98 |
+
<td width="50%">
|
| 99 |
+
<video class="embed-responsive-item" style="max-width: 80%; min-width: 400px;" controls>
|
| 100 |
+
<source src="videos/RPckvIkNWhE_ss301_to390_babel_numerique_arte.mp4" type="video/mp4">
|
| 101 |
+
Your browser does not support HTML video.
|
| 102 |
+
</video>
|
| 103 |
+
</td>
|
| 104 |
+
<td width="50%">
|
| 105 |
+
<video class="embed-responsive-item" style="max-width: 80%; min-width: 400px;" controls>
|
| 106 |
+
<source src="videos/uNAmODXvAiQ_ss9_message_a_caractere_informatif.mp4" type="video/mp4">
|
| 107 |
+
Your browser does not support HTML video.
|
| 108 |
+
</video>
|
| 109 |
+
</td>
|
| 110 |
+
<tr>
|
| 111 |
+
<td>
|
| 112 |
+
This example comes from a video explaining automated translation.
|
| 113 |
+
(<a href="https://www.youtube.com/watch?v=RPckvIkNWhE" target="_blank">source</a>, original video (c) Arte)
|
| 114 |
+
</td>
|
| 115 |
+
<td>
|
| 116 |
+
This example comes from a humorous video. The source voice is high-pitched on purpose,
|
| 117 |
+
it is a good showcase of how well Hibiki replicates pitch and prosody and how robust it is to
|
| 118 |
+
background noise <b>as no denoising is applied to the audio which is fed raw to Hibiki</b>.
|
| 119 |
+
(<a href="https://www.youtube.com/watch?v=uNAmODXvAiQ" target="_blank">source</a>, original video (c) Canal+)
|
| 120 |
+
</td>
|
| 121 |
+
</tr>
|
| 122 |
+
</table>
|
| 123 |
+
</div>
|
| 124 |
+
</div>
|
| 125 |
+
|
| 126 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 127 |
+
<h3>Examples with Ground Truth Interpretation<a id="vis"/></h3>
|
| 128 |
+
<p class="mb-0">
|
| 129 |
+
These samples come from the VoxPopuli dataset where the ground truth is real human
|
| 130 |
+
interpretation.
|
| 131 |
+
The volume for the sources has been reduced so that it's easier to hear the translations.
|
| 132 |
+
</p>
|
| 133 |
+
<div class="container pt-3 table-responsive">
|
| 134 |
+
<table class="table table-hover" id="vis-table2"></table>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
|
| 138 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 139 |
+
<h3>Multistream Visualization<a id="vis"/></h3>
|
| 140 |
+
<p class="mb-0">
|
| 141 |
+
The audio for the source and translated versions are on different channels. Use headphones
|
| 142 |
+
to hear both at the same time. These samples are the same as in the voxpopuli section with CFG
|
| 143 |
+
set to 3.
|
| 144 |
+
</p>
|
| 145 |
+
<div class="container pt-3 table-responsive">
|
| 146 |
+
<table class="table table-hover" id="vis-table"></table>
|
| 147 |
+
</div>
|
| 148 |
+
</div>
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 152 |
+
<h3>Impact of Classifier-Free Guidance<a id="voxpopuli"/></h3>
|
| 153 |
+
<p class="mb-0">
|
| 154 |
+
Samples taken from the VoxPopuli dataset. The Hibiki samples are presented with different levels
|
| 155 |
+
of classifier-free guidance (CFG). The higher the CFG value, the closer the generated voice will
|
| 156 |
+
be to the original voice. This results in very strong accents for the generations with the higher
|
| 157 |
+
values.
|
| 158 |
+
</p>
|
| 159 |
+
|
| 160 |
+
<div class="container pt-3 table-responsive">
|
| 161 |
+
<table
|
| 162 |
+
class="table table-hover"
|
| 163 |
+
id="voxpopuli-table"
|
| 164 |
+
>
|
| 165 |
+
<thead>
|
| 166 |
+
<tr>
|
| 167 |
+
<th style="text-align: center">Source</th>
|
| 168 |
+
<th style="text-align: center">Hibiki CFG-1</th>
|
| 169 |
+
<th style="text-align: center">Hibiki CFG-3</th>
|
| 170 |
+
<th style="text-align: center">Hibiki CFG-10</th>
|
| 171 |
+
<th style="text-align: center">Seamless</th>
|
| 172 |
+
</tr>
|
| 173 |
+
</thead>
|
| 174 |
+
<tbody>
|
| 175 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
| 176 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
| 177 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
| 178 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
| 179 |
+
<tr> <td></td> <td></td> <td></td> <td></td> <td></td></tr>
|
| 180 |
+
</tbody>
|
| 181 |
+
</table>
|
| 182 |
+
</div>
|
| 183 |
+
</div>
|
| 184 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 185 |
+
<h3>Long-form Simultaneous Translations<a id="ntrex"/></h3>
|
| 186 |
+
<p class="mb-0">
|
| 187 |
+
Samples taken from the audio NTREX dataset.
|
| 188 |
+
</p>
|
| 189 |
+
|
| 190 |
+
<div class="container pt-3 table-responsive">
|
| 191 |
+
<table
|
| 192 |
+
class="table table-hover"
|
| 193 |
+
id="ntrex-table"
|
| 194 |
+
>
|
| 195 |
+
<thead>
|
| 196 |
+
<tr>
|
| 197 |
+
<th style="text-align: center;min-width: 200px;">Source</th>
|
| 198 |
+
<th style="text-align: center;">Hibiki</th>
|
| 199 |
+
<th style="text-align: center">Seamless</th>
|
| 200 |
+
</tr>
|
| 201 |
+
</thead>
|
| 202 |
+
<tbody>
|
| 203 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 204 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 205 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 206 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 207 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 208 |
+
</tbody>
|
| 209 |
+
</table>
|
| 210 |
+
</div>
|
| 211 |
+
</div>
|
| 212 |
+
|
| 213 |
+
<div class="container shadow p-5 mb-5 bg-white rounded">
|
| 214 |
+
<h3>Short-form Simultaneous Translations<a id="cvss-c"/></h3>
|
| 215 |
+
<p class="mb-0">
|
| 216 |
+
Samples taken from the CVSS-C dataset.
|
| 217 |
+
</p>
|
| 218 |
+
|
| 219 |
+
<div class="container pt-3 table-responsive">
|
| 220 |
+
<table
|
| 221 |
+
class="table table-hover"
|
| 222 |
+
id="cvss-table"
|
| 223 |
+
>
|
| 224 |
+
<thead>
|
| 225 |
+
<tr>
|
| 226 |
+
<th style="text-align: center;min-width: 200px;">Source</th>
|
| 227 |
+
<th style="text-align: center;">Hibiki</th>
|
| 228 |
+
<th style="text-align: center">Seamless</th>
|
| 229 |
+
</tr>
|
| 230 |
+
</thead>
|
| 231 |
+
<tbody>
|
| 232 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 233 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 234 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 235 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 236 |
+
<tr> <td></td> <td></td> <td></td></tr>
|
| 237 |
+
</tbody>
|
| 238 |
+
</table>
|
| 239 |
+
</div>
|
| 240 |
+
</div>
|
| 241 |
+
|
| 242 |
+
<div class="container p-5 mb-5 bg-white rounded">
|
| 243 |
+
<p class="mb-0">
|
| 244 |
+
This page was adapted from the <a href="https://google-research.github.io/seanet/soundstorm/examples">SoundStorm project page</a>.
|
| 245 |
+
</p>
|
| 246 |
+
</div>
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
</body>
|
| 250 |
</html>
|
| 251 |
+
|