## Imports
from __future__ import unicode_literals
import cv2
import pandas as pd
from moviepy import VideoFileClip
from moviepy.video.tools.subtitles import SubtitlesClip
import os
import spaces
from moviepy.video.tools.subtitles import SubtitlesClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy import CompositeVideoClip
from moviepy import TextClip
import nemo.collections.asr as nemo_asr
import gradio as gr
#!/usr/bin/env python3
import csv, re, sys
from pathlib import Path
import urllib.request
# Fetch the P052-Roman font into the working directory at import time; the
# subtitle TextClip generator loads it from "./P052-Roman.ttf".
urllib.request.urlretrieve("https://github.com/Jameshskelton/fonts/raw/refs/heads/main/P052-Roman.ttf", "P052-Roman.ttf")


def parse_time_to_srt(t):
    """Render a time value as an SRT timestamp (``HH:MM:SS,mmm``).

    The value is converted with ``str()`` and stripped, then read as one of:

    * plain seconds, e.g. ``"12"`` or ``"12.5"``;
    * ``MM:SS`` where the seconds may carry a fraction;
    * ``HH:MM:SS`` where the seconds may carry a fraction.

    Raises:
        ValueError: if the value fits none of the layouts above.
    """
    raw = str(t).strip()
    if re.fullmatch(r"\d+(\.\d+)?", raw):
        total_seconds = float(raw)
    else:
        fields = raw.split(':')
        if len(fields) not in (2, 3):
            raise ValueError(f"Unrecognized time format: {raw}")
        fractional = float(fields[-1])
        # Fold the leading hour/minute fields in base 60, most significant first.
        whole_minutes = 0
        for field in fields[:-1]:
            whole_minutes = whole_minutes * 60 + int(field)
        total_seconds = whole_minutes * 60 + fractional

    total_ms = int(round(total_seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

def map_position_to_tag(pos):
    """Translate a free-form position description into an ``{\\anN}`` tag.

    ``N`` follows the numpad layout used by ASS/SSA alignment overrides:
    7/8/9 for the top row, 4/5/6 for the middle row and 1/2/3 for the bottom
    row, each ordered left, centre, right.

    Args:
        pos: Position description such as ``"top left"``, ``"bottom"``,
            ``"middle right"`` or an explicit ``"an7"`` / ``"\\an7"`` code.
            Matching is case-insensitive and substring based.

    Returns:
        str: The alignment tag, or ``""`` when ``pos`` is empty or names no
        vertical position (a bare ``"left"`` / ``"right"`` yields ``""``).
    """
    if not pos:
        return ""
    s = str(pos).strip().lower()

    # An explicit numeric alignment code wins over any descriptive words.
    m = re.search(r"\\?an([1-9])", s)
    if m:
        return "{\\an" + m.group(1) + "}"

    # Pick the row first. "top" and "bottom" take precedence over
    # "middle"/"center" so that "bottom center" lands on the bottom row
    # instead of being mistaken for the middle of the screen.
    if "top" in s:
        row_start = 7
    elif "bottom" in s:
        row_start = 1
    elif "middle" in s or "center" in s or "centre" in s:
        row_start = 4
    else:
        return ""

    # Then the column within the row; no horizontal word means centred.
    if "left" in s:
        column = 0
    elif "right" in s:
        column = 2
    else:
        column = 1
    return "{\\an" + str(row_start + column) + "}"

# A cell holding plain seconds, MM:SS or HH:MM:SS (optionally fractional).
_TIMESTAMP_CELL = re.compile(r"(?:\d+:){0,2}\d+(?:[.,]\d+)?")


def looks_like_header(row):
    """Guess whether a CSV row is a header rather than subtitle data.

    A row counts as a header when any of its first four cells contains a
    typical column-name word (``position``, ``start``, ``end``, ``text``, ...).
    Rows whose second and third cells both look like timestamps are always
    treated as data: the word check also scans the subtitle text column, so
    without this guard a first cue such as ``0,1.5,2.5,The end`` would be
    mistaken for a header and dropped.

    Args:
        row: Sequence of string cells from one CSV row.

    Returns:
        bool: ``True`` if the row looks like a header.
    """
    cells = [c.strip().lower() for c in row[:4]]
    if (
        len(cells) >= 3
        and _TIMESTAMP_CELL.fullmatch(cells[1])
        and _TIMESTAMP_CELL.fullmatch(cells[2])
    ):
        return False
    joined = ",".join(cells)
    header_words = ["position", "pos", "align", "start", "begin", "end", "stop", "subtitle", "text", "caption"]
    return any(w in joined for w in header_words)

def csv_to_srt(csv_path: Path, srt_path: Path):
    """Convert a subtitle CSV into an SRT file.

    Each non-blank CSV row must provide at least four columns, read in order
    as position, start time, end time and text. A leading header row is
    skipped when detected. The position is turned into an alignment tag that
    is prefixed to the cue text, and cues are numbered from 1 in file order.

    Every row is validated and converted before the output file is opened, so
    a malformed row raises without leaving a truncated SRT behind.

    Args:
        csv_path: Path of the CSV to read (UTF-8, BOM tolerated).
        srt_path: Path of the SRT file to write (UTF-8).

    Raises:
        ValueError: if the CSV has no non-blank rows, a row has fewer than
            four columns, or a start/end time cannot be parsed.
    """
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.reader(f)
        rows = [row for row in reader if any(cell.strip() for cell in row)]
    if not rows:
        raise ValueError("CSV is empty.")
    start_index = 1 if looks_like_header(rows[0]) else 0

    normalized = []
    for i, row in enumerate(rows[start_index:], start=start_index + 1):
        if len(row) < 4:
            raise ValueError(f"Row {i} has fewer than 4 columns: {row}")
        # The text column is kept verbatim; only the metadata cells are stripped.
        normalized.append((row[0].strip(), row[1].strip(), row[2].strip(), row[3]))

    # Build every cue up front so that a bad timestamp fails before any output
    # is written.
    cues = []
    for idx, (position, start, end, text) in enumerate(normalized, start=1):
        start_srt = parse_time_to_srt(start)
        end_srt = parse_time_to_srt(end)
        pos_tag = map_position_to_tag(position)
        cues.append(f"{idx}\n{start_srt} --> {end_srt}\n{pos_tag}{text}\n\n")

    with open(srt_path, "w", encoding="utf-8") as out:
        out.writelines(cues)

from pydub import AudioSegment

def convert_audio_to_mono_16khz(input_path, output_path):
    """
    Downmix an audio file to a single channel at 16 kHz and save it as WAV.

    Failures are reported on stdout rather than raised, so callers are not
    interrupted when the conversion cannot be performed.

    Args:
        input_path (str): Location of the audio file to read.
        output_path (str): Location where the converted WAV is written.
    """
    try:
        # Load, collapse to one channel, resample to 16000 Hz, then export.
        mono_16k = (
            AudioSegment.from_file(input_path)
            .set_channels(1)
            .set_frame_rate(16000)
        )
        mono_16k.export(output_path, format="wav")
        print(f"Audio converted successfully to mono, 16kHz at: {output_path}")
    except Exception as e:
        print(f"Error converting audio: {e}")

# Load the pretrained Parakeet checkpoint once at import time; subtitle_video
# uses this shared module-level instance for transcription.
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")

@spaces.GPU
def subtitle_video(input_file):
    """Transcribe a video with Parakeet and burn the transcript in as subtitles.

    All intermediate and final files are written to ``experiments/run``:
    a re-encoded copy of the input, the extracted audio, ``subs.csv``,
    ``subs.srt`` and the captioned ``output.mp4``.

    Args:
        input_file (str): Path to the video file to caption.

    Returns:
        tuple: ``(output_video_path, subtitles, srt_path)`` where
        ``subtitles`` is a DataFrame with one row per transcribed segment
        (``start``, ``end`` and ``text`` columns).

    Raises:
        gr.Error: if no video was supplied, the video has no audio track, or
            the transcription produced no segments.
    """
    if not input_file:
        raise gr.Error("Please upload a video before running captioning.")

    name = 'run'
    run_dir = f'experiments/{name}'
    # makedirs also creates the parent "experiments" folder and tolerates
    # reruns, so no exception has to be swallowed here.
    os.makedirs(run_dir, exist_ok=True)
    print('Starting AutoCaptioning...')
    print(f'Results will be stored in {run_dir}')

    audio_path = f'{run_dir}/audio_file.wav'
    csv_path = f'{run_dir}/subs.csv'
    srt_path = f'{run_dir}/subs.srt'
    output_path = f'{run_dir}/output.mp4'

    # The context manager releases the ffmpeg readers even if a step fails.
    with VideoFileClip(input_file) as my_clip:
        if my_clip.audio is None:
            raise gr.Error("The uploaded video has no audio track to transcribe.")
        # Remember the frame size for the subtitle layout after the clip closes.
        width, height = my_clip.w, my_clip.h
        # NOTE(review): nothing below reads this re-encoded copy; consider
        # dropping it to save a full encode per request.
        my_clip.write_videofile(f"{run_dir}/{os.path.basename(input_file)}")
        # Write real 16-bit PCM so the .wav extension matches its contents.
        my_clip.audio.write_audiofile(audio_path, codec="pcm_s16le")

    # Convert in place to the mono / 16 kHz format passed to the model.
    convert_audio_to_mono_16khz(audio_path, audio_path)

    # Transcribe audio with segment-level timestamps.
    output = asr_model.transcribe([audio_path], timestamps=True)
    segments = output[0].timestamp['segment']
    if not segments:
        # An empty segment list would otherwise fail on the missing column below.
        raise gr.Error("No speech was detected in the video, so there is nothing to caption.")

    # Keep only start, end and text for each segment.
    df = pd.DataFrame(segments)
    df['text'] = df['segment']
    df = df.drop(['start_offset', 'end_offset', 'segment'], axis=1)

    # The DataFrame index is written as the first CSV column, which fills the
    # "position" slot of the four-column layout csv_to_srt reads.
    df.to_csv(csv_path)
    csv_to_srt(csv_path, srt_path)

    def make_textclip(txt):
        # Full-frame caption box with the text anchored bottom-centre.
        return TextClip(
            "./P052-Roman.ttf",
            text=txt,
            font_size=int(width / 50),
            stroke_width=1,
            color="white",
            stroke_color="black",
            size=(width, height),
            vertical_align='bottom',
            horizontal_align='center',
            method='caption',
        )

    subs = SubtitlesClip(srt_path, make_textclip=make_textclip)

    with VideoFileClip(input_file) as video:
        final = CompositeVideoClip([video, subs])
        final.write_videofile(output_path, fps=video.fps, remove_temp=True, codec="libx264", audio_codec="aac")
    return output_path, df, srt_path

# Gradio UI: one video input, a run button, and three outputs (captioned
# video, subtitle table, SRT download). Components appear in creation order.
with gr.Blocks() as demo:
    gr.Markdown("<div style='display:flex;justify-content:center;align-items:center;gap:.5rem;font-size:24px;'>🦜 <strong>Parakeet AutoCaption Web App</strong></div>")
    with gr.Column():
        input_video = gr.Video(label = 'Input your video for captioning')
        # input_name = gr.Textbox(label = 'Name of your experiment run')
    with gr.Column():
        run_button = gr.Button('Run Video Captioning')
    with gr.Column():
        output_video = gr.Video(label = 'Output Video')
        output_subs = gr.Dataframe(label = 'Output Subtitles')
        output_subs_srt_file = gr.DownloadButton(label = 'Download subtitles as SRT file')
    # Clicking the button runs subtitle_video on the uploaded video; its three
    # return values are routed to the three output components in order.
    gr.on(
        triggers=[run_button.click],
        fn=subtitle_video,
        inputs=[
            input_video,
        ],
        outputs=[output_video, output_subs, output_subs_srt_file],
    )
# Start the app only when executed as a script; share=True asks Gradio for a
# public share link.
if __name__ == "__main__":
    demo.launch(share=True)