update gradio
- app.py +26 -18
- requirements.txt +1 -1
app.py
CHANGED

@@ -63,7 +63,7 @@ async def speech_to_text(video_file_path):
     Using https://huggingface.co/tasks/automatic-speech-recognition pipeline
     """
     global total_inferences_since_reboot
-    if(video_file_path == None):
+    if (video_file_path == None):
         raise ValueError("Error no video input")
 
     video_path = Path(video_file_path)
@@ -84,6 +84,7 @@ async def speech_to_text(video_file_path):
         print(f'Transcribing from API attempt {tries}')
         try:
             inference_reponse = await query_api(audio_memory)
+            print(inference_reponse)
             transcription = inference_reponse["text"].lower()
             timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
                           for chunk in inference_reponse['chunks']]
@@ -92,7 +93,8 @@ async def speech_to_text(video_file_path):
             print("\n\ntotal_inferences_since_reboot: ",
                   total_inferences_since_reboot, "\n\n")
             return (transcription, transcription, timestamps)
-        except:
+        except Exception as e:
+            print(e)
             if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
                 wait_time = inference_reponse['estimated_time']
                 print("Waiting for model to load....", wait_time)
@@ -134,7 +136,7 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
 
     video_path = Path(video_in)
     video_file_name = video_path.stem
-    if(video_in == None or text_in == None or transcription == None):
+    if (video_in == None or text_in == None or transcription == None):
         raise ValueError("Inputs undefined")
 
     d = Differ()
@@ -150,7 +152,7 @@ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
     # groupping character timestamps so there are less cuts
     idx = 0
     grouped = {}
-    for(a, b) in zip(filtered, timestamps):
+    for (a, b) in zip(filtered, timestamps):
         if a[0] != '-':
             if idx in grouped:
                 grouped[idx].append(b)
@@ -203,7 +205,15 @@ async def query_api(audio_bytes: bytes):
     }).encode("utf-8")
     async with aiohttp.ClientSession() as session:
         async with session.post(API_URL, headers=headers, data=payload) as response:
-
+            print("API Response: ", response.status)
+            if response.headers['Content-Type'] == 'application/json':
+                return await response.json()
+            elif response.headers['Content-Type'] == 'application/octet-stream':
+                return await response.read()
+            elif response.headers['Content-Type'] == 'text/plain':
+                return await response.text()
+            else:
+                raise RuntimeError("Error Fetching API")
 
 
 def ping(name):
@@ -222,28 +232,26 @@ video_in = gr.Video(label="Video file")
 text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
 video_out = gr.Video(label="Video Out")
 diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
-examples = gr.
-    components=[video_in], samples=VIDEOS, type="index")
+examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index")
 
-
+css = """
 #cut_btn, #reset_btn { align-self:stretch; }
 #\\31 3 { max-width: 540px; }
 .output-markdown {max-width: 65ch !important;}
-
-
-with demo:
+"""
+with gr.Blocks(css=css) as demo:
     transcription_var = gr.Variable()
     timestamps_var = gr.Variable()
     with gr.Row():
         with gr.Column():
-            gr.Markdown(
+            gr.Markdown("""
             # Edit Video By Editing Text
             This project is a quick proof of concept of a simple video editor where the edits
             are made by editing the audio transcription.
             Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
             with a fine tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
             you can predict not only the text transcription but also the [character or word base timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
-
+            """)
 
     with gr.Row():
 
@@ -269,9 +277,9 @@ with demo:
             text_in, transcription_var, timestamps_var])
 
     with gr.Row():
-        gr.Markdown(
+        gr.Markdown("""
         ### Now edit as text
-        After running the video transcription, you can make cuts to the text below (only cuts, not additions!)
+        After running the video transcription, you can make cuts to the text below (only cuts, not additions!)""")
 
     with gr.Row():
         with gr.Column():
@@ -290,13 +298,13 @@ with demo:
         video_out.render()
         diff_out.render()
     with gr.Row():
-        gr.Markdown(
+        gr.Markdown("""
        #### Video Credits
 
         1. [Cooking](https://vimeo.com/573792389)
         1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
         1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
-
-
+        """)
+demo.queue()
 if __name__ == "__main__":
     demo.launch(debug=True)
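For context on the pipeline the app's intro text describes (a fine-tuned Wav2Vec2 CTC model returning character or word timestamps), here is a minimal local sketch, not part of this commit: it assumes the `transformers` pipeline API, the model linked in the Markdown above, and a hypothetical audio file `sample.wav`. It shows the `text`/`chunks` shape that `speech_to_text()` consumes from the API response.

```python
# Minimal sketch (assumption: transformers installed; "sample.wav" is a hypothetical file).
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-large-960h-lv60-self",
)

# return_timestamps="char" asks the CTC pipeline for per-character timestamps.
result = asr("sample.wav", return_timestamps="char")

print(result["text"])             # full transcription
for chunk in result["chunks"]:    # same shape the app reads: text plus (start, end) in seconds
    print(chunk["text"], chunk["timestamp"])
```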
requirements.txt
CHANGED

@@ -1,6 +1,6 @@
 torch
 transformers
-gradio==3.
+gradio==3.35.2
 datasets
 librosa
 ffmpeg-python
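The commit pins gradio==3.35.2, and the app.py changes above switch to creating the Blocks context directly with `with gr.Blocks(css=css) as demo:` and calling `demo.queue()` before launch. A minimal sketch of that pattern under Gradio 3.x follows; the component names and callback are illustrative, not the app's actual layout.

```python
# Minimal sketch (assumption: gradio==3.35.2 as pinned above).
import gradio as gr

css = """
.output-markdown {max-width: 65ch !important;}
"""

with gr.Blocks(css=css) as demo:   # Blocks built directly with css, as in the updated app.py
    gr.Markdown("# Demo")
    name = gr.Textbox(label="Name")
    greeting = gr.Textbox(label="Greeting")
    # Pressing Enter in the Name box fills the Greeting box.
    name.submit(lambda n: f"Hello {n}", inputs=name, outputs=greeting)

demo.queue()                       # enable request queuing before launching
if __name__ == "__main__":
    demo.launch(debug=True)
```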