vikramvasudevan committed on
Commit
4aebf77
·
verified ·
1 Parent(s): a19a3df

Upload folder using huggingface_hub

Browse files
app.py CHANGED
@@ -23,16 +23,23 @@ from db import SanatanDatabase
23
  from drive_downloader import ZipDownloader
24
  from graph_helper import generate_graph
25
  from nalayiram_helper import delete_taniyan
 
26
 
27
  # Logging
28
  logging.basicConfig()
29
  logger = logging.getLogger()
30
- logger.setLevel(logging.INFO)
31
 
32
- graph = generate_graph()
 
 
 
 
 
33
 
34
- import pycountry
 
35
 
 
36
 
37
  def get_all_languages():
38
  """
@@ -89,7 +96,7 @@ message_textbox = gr.Textbox(
89
 
90
  with gr.Blocks(
91
  theme=gr.themes.Citrus(),
92
- title="Sanatan-AI",
93
  css="""
94
  /* hide the additional inputs row under the textbox */
95
  .gr-chat-interface .gr-form {
 
23
  from drive_downloader import ZipDownloader
24
  from graph_helper import generate_graph
25
  from nalayiram_helper import delete_taniyan
26
+ import pycountry
27
 
28
  # Logging
29
  logging.basicConfig()
30
  logger = logging.getLogger()
 
31
 
32
+ logger.setLevel(logging.INFO)
33
+ # Suppress OpenAI debug logs
34
+ logging.getLogger("openai").setLevel(logging.WARNING)
35
+ # Silence httpx + httpcore logs
36
+ logging.getLogger("httpx").setLevel(logging.WARNING)
37
+ logging.getLogger("httpcore").setLevel(logging.WARNING)
38
 
39
+ # (Optional) Silence OpenAI logs too
40
+ logging.getLogger("openai").setLevel(logging.WARNING)
41
 
42
+ graph = generate_graph()
43
 
44
  def get_all_languages():
45
  """
 
96
 
97
  with gr.Blocks(
98
  theme=gr.themes.Citrus(),
99
+ title="Sanatan-AI | Chat",
100
  css="""
101
  /* hide the additional inputs row under the textbox */
102
  .gr-chat-interface .gr-form {
main.py CHANGED
@@ -4,6 +4,8 @@ from fastapi.responses import RedirectResponse
4
  import uvicorn
5
  from fastapi import FastAPI
6
  from modules.dropbox.audio import cleanup_audio_url_cache
 
 
7
  from server import router as mobile_router
8
  from app import gradio_app # your Blocks object
9
  import gradio as gr
@@ -18,12 +20,16 @@ app = FastAPI(title="Sanatan AI Unified Server")
18
  app.include_router(mobile_router, prefix="/api")
19
 
20
  # Convert Gradio Blocks to ASGI app
21
- app = gr.mount_gradio_app(app, gradio_app,"/web")
22
 
23
- # Redirect root URL to /web/
 
 
 
 
24
  @app.get("/")
25
  async def redirect_to_web():
26
- return RedirectResponse(url="/web/")
27
 
28
  @app.middleware("http")
29
  async def log_requests(request: Request, call_next):
@@ -40,4 +46,4 @@ async def lifespan(app: FastAPI):
40
  # Shutdown code (optional) can go here
41
 
42
  if __name__ == "__main__":
43
- uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=False)
 
4
  import uvicorn
5
  from fastapi import FastAPI
6
  from modules.dropbox.audio import cleanup_audio_url_cache
7
+ from modules.home.app import home_app
8
+ from modules.youtube_metadata.app import youtube_metadata_app
9
  from server import router as mobile_router
10
  from app import gradio_app # your Blocks object
11
  import gradio as gr
 
20
  app.include_router(mobile_router, prefix="/api")
21
 
22
  # Convert Gradio Blocks to ASGI app
23
+ app = gr.mount_gradio_app(app, gradio_app,"/sanatan_ai_web")
24
 
25
+ app = gr.mount_gradio_app(app, youtube_metadata_app,"/yt_web")
26
+
27
+ app = gr.mount_gradio_app(app, home_app,"/home")
28
+
29
+ # Redirect root URL to /home/
30
  @app.get("/")
31
  async def redirect_to_web():
32
+ return RedirectResponse(url="/home/")
33
 
34
  @app.middleware("http")
35
  async def log_requests(request: Request, call_next):
 
46
  # Shutdown code (optional) can go here
47
 
48
  if __name__ == "__main__":
49
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=False, access_log=False)
modules/home/app.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
import gradio as gr

# Landing page: a simple navigation hub for the sub-apps mounted in main.py.
with gr.Blocks(title="Sanatana AI - Home") as home_app:
    gr.Markdown("## Welcome to Sanatan AI!")
    with gr.Row():
        gr.Button("Go to Sanatan AI", link="/sanatan_ai_web")  # mounted at /sanatan_ai_web
        gr.Button("Manage Youtube Metadata", link="/yt_web")  # mounted at /yt_web
modules/youtube_metadata/answerer.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -------------------------------
2
+ # 4. Answerer
3
+ # -------------------------------
4
+ from typing import List
5
+ from pydantic import BaseModel
6
+ from openai import OpenAI
7
+ from modules.youtube_metadata.retriever import retrieve_videos
8
+
9
+
10
+ # -------------------------------
11
+ # Structured Output Classes
12
+ # -------------------------------
13
class VideoItem(BaseModel):
    """One candidate video surfaced to the user (structured-output schema)."""

    video_id: str  # YouTube id used to build watch/embed URLs
    title: str
    channel: str  # channel display name
    description: str  # verbatim description from the indexed metadata
18
+
19
+
20
class LLMAnswer(BaseModel):
    """Structured LLM response: a short textual answer plus selected videos."""

    answer_text: str  # concise natural-language answer (prompt caps it at ~100 words)
    top_videos: List[VideoItem]  # only the most relevant candidates
23
+
24
+
25
+ # -------------------------------
26
+ # Main Function
27
+ # -------------------------------
28
def answer_query(
    query: str, collection, top_k: int = 5, channel_id: "str | None" = None
) -> "tuple[str, str]":
    """
    Answer a user query using YouTube video metadata.

    Retrieves candidate videos from the vector store, asks the LLM (structured
    output) to pick the most relevant ones, and returns an
    ``(answer_text, video_html)`` tuple ready for the Gradio UI.

    Args:
        query: Natural-language question from the user.
        collection: Chroma collection holding the indexed video metadata.
        top_k: Number of candidates to retrieve for the LLM to choose from.
        channel_id: Optional channel filter; ``None`` searches all channels.
    """
    results = retrieve_videos(query, collection, top_k=top_k, channel_id=channel_id)

    # BUG FIX: this branch used to return a bare LLMAnswer while the main path
    # returns a (text, html) tuple; callers unpack two values, so the shapes
    # must agree.
    if not results:
        return "No relevant videos found.", build_video_html([])

    # Build one context line per candidate for the LLM prompt.
    context_lines = []
    for r in results:
        if not isinstance(r, dict):
            continue
        vid_id = r.get("video_id", "")
        title = r.get("video_title") or r.get("title", "")
        channel = r.get("channel") or r.get("channel_title", "")
        description = r.get("description", "")
        context_lines.append(
            f"- {title} ({channel}) (https://youtube.com/watch?v={vid_id})\n description: {description}"
        )

    context_text = "\n".join(context_lines)

    # Structured-output call: the SDK parses the response into LLMAnswer.
    client = OpenAI()
    response = client.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant that answers questions using YouTube video metadata. "
                    "Return your response strictly as the LLMAnswer class, including 'answer_text' and a list of **only the most relevant** 'top_videos'.\n"
                    "- `answer_text` MUST be very short and concise in natural language (max 100 words).\n"
                    "- Use `top_videos` to include only the top 3 most relevant items from context.\n"
                    "- Do not include all items unless all are clearly relevant.\n"
                    "- Do not makeup `description`. Use the exact descriptions as given in the context"
                ),
            },
            {
                "role": "user",
                "content": f"Question: {query}\n\nCandidate videos:\n{context_text}\n\nPick only the relevant ones.",
            },
        ],
        response_format=LLMAnswer,
    )

    llm_answer = response.choices[0].message.parsed
    # BUG FIX: `.parsed` can be None when the model refuses or fails to conform
    # to the schema; guard before dereferencing.
    if llm_answer is None:
        return "No answer available.", build_video_html([])

    answer_text = "\n## Answer : \n" + llm_answer.answer_text
    video_html = build_video_html(llm_answer.top_videos)
    return answer_text, video_html
83
+
84
+
85
def build_video_html(videos: "list[VideoItem]") -> str:
    """Build an HTML table (description + embedded player) from top_videos.

    Titles, channel names and descriptions come from YouTube — untrusted
    text — so they are HTML-escaped before interpolation to prevent markup
    injection in the Gradio HTML component.
    """
    import html as _html  # local import: keeps file-level imports untouched

    if not videos:
        return "<p>No relevant videos found.</p>"

    out = """
    <table border="1" style="border-collapse: collapse; width: 100%;">
      <tr>
        <th>Description</th>
        <th>Watch</th>
      </tr>
    """
    for v in videos:
        title = _html.escape(v.title)
        channel = _html.escape(v.channel)
        description = _html.escape(v.description)
        video_id = _html.escape(v.video_id, quote=True)  # goes into an attribute
        embed_html = f"""
        <div style="margin-bottom: 20px;">
            <strong>{title}</strong> ({channel})<br>
            <iframe width="360" height="203"
                src="https://www.youtube.com/embed/{video_id}"
                frameborder="0"
                allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
                allowfullscreen>
            </iframe>
        </div>
        """
        out += f"""
        <tr>
            <td>{description}</td>
            <td>{embed_html}</td>
        </tr>
        """
    out += "</table>"
    return out
modules/youtube_metadata/app.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ import re
4
+ import threading
5
+ import gradio as gr
6
+ from gradio_modal import Modal
7
+ from modules.youtube_metadata.downloader import export_channel_json
8
+ from modules.youtube_metadata.channel_utils import fetch_channel_dataframe
9
+ from modules.youtube_metadata.db import (
10
+ delete_channel_from_collection,
11
+ get_collection,
12
+ get_indexed_channels,
13
+ )
14
+ from modules.youtube_metadata.answerer import answer_query
15
+ from dotenv import load_dotenv
16
+
17
+ from modules.youtube_metadata.youtube_poller import start_poll
18
+ from modules.youtube_metadata.youtube_sync import sync_channels_from_youtube
19
+
20
+ load_dotenv()
21
+
22
+
23
+ # -------------------------------
24
+ # Utility functions
25
+ # -------------------------------
26
def refresh_channel_list():
    """Gradio update that re-populates the channel Radio's choices."""
    choices = list_channels_radio()
    return gr.update(choices=choices)
28
+
29
+
30
def show_component():
    """Gradio update: make the target component visible."""
    return gr.update(visible=True)


def hide_component():
    """Gradio update: hide the target component."""
    return gr.update(visible=False)


def open_component():
    """Gradio update: open a collapsible component (e.g. the sidebar/modal)."""
    return gr.update(open=True)


def close_component():
    """Gradio update: collapse a collapsible component."""
    return gr.update(open=False)


def enable_component():
    """Gradio update: make the target component interactive."""
    return gr.update(interactive=True)


def disable_component():
    """Gradio update: grey out the target component."""
    return gr.update(interactive=False)


def clear_component():
    """Gradio update: reset the target component's value to an empty string."""
    return gr.update(value="")
56
+
57
+
58
def show_loading(question):
    """Status update shown while *question* is being answered."""
    message = f"⏳Fetching details on [{question}]..."
    return gr.update(value=message)
60
+
61
+
62
def enable_if_not_none(question):
    """Enable the target component only when *question* is set."""
    return disable_component() if question is None else enable_component()
67
+
68
+
69
def index_channels(channel_urls: str):
    """Generator: index the given channels, streaming status to the UI.

    *channel_urls* is a comma- and/or newline-separated string of channel
    URLs. Yields (status_message, radio_update, channel_list) triples;
    intermediate yields leave the two UI outputs untouched via bare
    gr.update().
    """
    yield "saving ...", gr.update(), gr.update()
    yt_api_key = os.environ["YOUTUBE_API_KEY"]

    # Split on commas and/or newlines, dropping empty fragments.
    urls = [u.strip() for u in re.split(r"[\n,]+", channel_urls) if u.strip()]
    total_videos = 0

    # sync all channels, streaming progress
    for message, videos_count in sync_channels_from_youtube(yt_api_key, urls):
        total_videos = videos_count  # keeps the last reported count — presumably a running total; TODO confirm
        yield message, gr.update(), gr.update()

    # final UI update
    yield (
        f"✅ Indexed {total_videos} videos from {len(urls)} channels.",
        refresh_channel_list(),
        list_channels_radio(),
    )
87
+
88
+
89
def youtube_metadata_init(progress: gr.Progress = None):
    """Index the default set of channels, yielding progress messages.

    Used by the background bootstrap thread. The Gradio component updates
    yielded by index_channels are discarded here — only the status message
    is forwarded.
    """
    channels = (
        "https://www.youtube.com/@onedayonepasuram6126,"
        "https://www.youtube.com/@srisookthi,"
        "https://www.youtube.com/@learn-aksharam,"
        "https://www.youtube.com/@SriYadugiriYathirajaMutt,"
        "https://www.youtube.com/@akivasudev,"
        "https://www.youtube.com/@Arulicheyal_Amutham"
    )
    # BUG FIX: the loop previously unpacked into `msg, upd, upd`, rebinding
    # the same name twice; use distinct throwaway names for the unused
    # UI-update values.
    for msg, _radio_update, _state_update in index_channels(channels):
        yield msg
101
+
102
+
103
def refresh_all_channels():
    """Re-index every channel currently in the collection from YouTube.

    Returns a (status_message, radio_update) pair for the Gradio UI.
    """
    yt_api_key = os.environ["YOUTUBE_API_KEY"]
    channels = get_indexed_channels(get_collection())

    if not channels:
        return "⚠️ No channels available to refresh.", refresh_channel_list()

    # Build the list of channel URLs (values may be dicts or plain titles).
    urls = []
    for key, val in channels.items():
        url = val.get("channel_url") if isinstance(val, dict) else key
        if url:
            urls.append(url)

    # BUG FIX: sync_channels_from_youtube is a *generator* of
    # (message, running_count) pairs; it was previously assigned directly to
    # total_videos, so the sync never actually ran and the status message
    # showed a generator repr. Consume it and keep the last reported count.
    total_videos = 0
    for _message, videos_count in sync_channels_from_youtube(yt_api_key, urls):
        total_videos = videos_count

    return (
        f"🔄 Refreshed {len(urls)} channels, re-indexed {total_videos} videos.",
        refresh_channel_list(),
    )
124
+
125
+
126
+ # -------------------------------
127
+ # Channel selection as radio
128
+ # -------------------------------
129
def list_channels_radio():
    """Build (label, value) choice pairs for the channel Radio widget."""
    choices = []
    for key, val in get_indexed_channels(get_collection()).items():
        # Entries may be dicts (title + url) or plain title strings keyed by id.
        if isinstance(val, dict):
            label = val.get("channel_title", "Unknown")
            value = val.get("channel_url")
        else:
            label = val
            value = key
        if value:
            choices.append((label, value))
    return choices
143
+
144
+
145
+ # Delete a channel
146
+ # -------------------------------
147
def delete_channel(channel_url: str):
    """Delete the selected channel's videos, then refresh the Radio choices."""
    delete_channel_from_collection(channel_url)
    return refresh_channel_list()
151
+
152
+
153
+ # -------------------------------
154
+ # LLM query
155
+ # -------------------------------
156
def handle_query(query: str, search_channel_id: str):
    """Run the retrieval + LLM pipeline and normalise outputs for Gradio."""
    answer_text, video_html = answer_query(
        query, get_collection(), channel_id=search_channel_id, top_k=10
    )
    answer_text = answer_text or "No answer available."
    # gr.HTML needs a string — coerce anything else to empty.
    if not video_html or not isinstance(video_html, str):
        video_html = ""
    return answer_text, video_html
165
+
166
+
167
+ # -------------------------------
168
+ # Gradio UI
169
+ # -------------------------------
170
+ with gr.Blocks(title="Sanatana AI - Youtube Metadata Surfer") as youtube_metadata_app:
171
+ gr.Markdown("### 📺 YouTube Channel Surfer")
172
+
173
+ with Modal(visible=False) as download_modal:
174
+ with gr.Row():
175
+ gr.Column()
176
+ download_status = gr.Markdown("## Preparing the file ...")
177
+ gr.Column()
178
+ with gr.Row():
179
+ gr.Column()
180
+ download_ready_btn = gr.DownloadButton(
181
+ label="Click to Download",
182
+ visible=False,
183
+ variant="primary",
184
+ scale=0,
185
+ )
186
+ gr.Column()
187
+
188
+ # Modal to show channel videos
189
+ with Modal(visible=False) as videos_list_modal:
190
+ gr.Markdown("### Videos List")
191
+
192
+ # the HTML table that shows one page of videos
193
+ # modal_html = gr.HTML()
194
+ channel_videos_df = gr.DataFrame(
195
+ show_search=True,
196
+ show_copy_button=True,
197
+ show_fullscreen_button=True,
198
+ datatype=[
199
+ "int",
200
+ "str",
201
+ "str",
202
+ "html",
203
+ ],
204
+ headers=["#", "title", "description", "url"],
205
+ column_widths=["5%", "25%", "60%", "10%"],
206
+ wrap=True,
207
+ col_count=(4, "fixed"),
208
+ )
209
+
210
+ # Modal to add new channels
211
+ with Modal(visible=False) as add_channel_modal:
212
+ channel_input = gr.Textbox(
213
+ label="Channel URLs",
214
+ placeholder="Paste one or more YouTube channel URLs (comma or newline separated)",
215
+ )
216
+ examples = {
217
+ "Comma Separated Channels Example": "https://www.youtube.com/@onedayonepasuram6126,https://www.youtube.com/@srisookthi,https://www.youtube.com/@learn-aksharam,https://www.youtube.com/@SriYadugiriYathirajaMutt",
218
+ "Newline Separated Channels Example": "https://www.youtube.com/@onedayonepasuram6126\nhttps://www.youtube.com/@srisookthi\nhttps://www.youtube.com/@learn-aksharam\nhttps://www.youtube.com/@SriYadugiriYathirajaMutt",
219
+ "One Day One Pasuram": "https://www.youtube.com/@onedayonepasuram6126",
220
+ "Sri Sookthi": "https://www.youtube.com/@srisookthi",
221
+ "Aksharam": "https://www.youtube.com/@learn-aksharam",
222
+ "Cricinfo": "https://www.youtube.com/@espncricinfo",
223
+ "Chanakyaa": "https://www.youtube.com/@ChanakyaaTV",
224
+ "Aptitude Guru": "https://www.youtube.com/@AptitudeGuruHem",
225
+ "Universe Genius": "https://www.youtube.com/@UniverseGenius",
226
+ "Praveen Mohan": "https://www.youtube.com/@RealPraveenMohan",
227
+ "Yathiraja Mutt": "https://www.youtube.com/@SriYadugiriYathirajaMutt",
228
+ "Vasudevan Srinivasachariar": "https://www.youtube.com/@akivasudev",
229
+ }
230
+
231
+ def set_example(label):
232
+ return examples[label]
233
+
234
+ gr.Markdown("Click on any example below and then click on add channels button.")
235
+ with gr.Row():
236
+ for label in examples:
237
+ gr.Button(label, size="sm", variant="huggingface", scale=0).click(
238
+ fn=set_example,
239
+ inputs=gr.State(label),
240
+ outputs=channel_input,
241
+ )
242
+
243
+ with gr.Row():
244
+ gr.Column()
245
+ save_add_channels_btn = gr.Button(
246
+ "Add Channel(s)", scale=0, variant="primary"
247
+ )
248
+ gr.Column()
249
+ index_status = gr.Markdown(label="Index Status", container=False)
250
+
251
+ with gr.Row():
252
+ # Sidebar
253
+ with gr.Sidebar() as my_sidebar:
254
+ gr.Markdown("### 📺 Channels")
255
+ channel_list_values = list_channels_radio()
256
+ channel_list_state = gr.State(channel_list_values)
257
+
258
+ no_channels_message = gr.Markdown(
259
+ "⚠️ **No channels available.**",
260
+ visible=False if channel_list_values else True,
261
+ )
262
+ channel_radio = gr.Radio(
263
+ choices=channel_list_values,
264
+ label="Select a Channel",
265
+ visible=True if channel_list_values else False,
266
+ )
267
+
268
+ with gr.Row():
269
+ export_btn = gr.Button(
270
+ "⏬ Download",
271
+ size="sm",
272
+ scale=0,
273
+ variant="primary",
274
+ interactive=False,
275
+ )
276
+ show_videos_btn = gr.Button(
277
+ "🎬Videos",
278
+ size="sm",
279
+ scale=0,
280
+ variant="secondary",
281
+ interactive=False,
282
+ )
283
+ refresh_btn = gr.Button(
284
+ "⭮ Refresh",
285
+ size="sm",
286
+ scale=0,
287
+ variant="huggingface",
288
+ )
289
+ refresh_all_btn = gr.Button(
290
+ "🔄 Sync from YouTube",
291
+ size="sm",
292
+ scale=0,
293
+ variant="stop",
294
+ visible=False,
295
+ )
296
+ add_channels_btn = gr.Button(
297
+ "➕ Add", size="sm", scale=0, variant="primary"
298
+ )
299
+
300
+ delete_channel_btn = gr.Button(
301
+ "🗑️ Delete", size="sm", scale=0, variant="stop"
302
+ )
303
+
304
+ refresh_status = gr.Markdown(label="Refresh Status", container=False)
305
+
306
+ refresh_all_btn.click(
307
+ fn=refresh_all_channels,
308
+ inputs=None,
309
+ outputs=[refresh_status, channel_radio],
310
+ )
311
+
312
+ refresh_btn.click(fn=refresh_channel_list, outputs=[channel_radio]).then(
313
+ fn=list_channels_radio, outputs=[channel_list_state]
314
+ )
315
+ add_channels_btn.click(close_component, outputs=[my_sidebar]).then(
316
+ show_component, outputs=[add_channel_modal]
317
+ )
318
+
319
+ def toggle_no_data_found(channel_list):
320
+ if channel_list:
321
+ return show_component(), hide_component()
322
+ else:
323
+ return hide_component(), show_component()
324
+
325
+ save_add_channels_btn.click(
326
+ disable_component, outputs=[save_add_channels_btn]
327
+ ).then(
328
+ index_channels,
329
+ inputs=[channel_input],
330
+ outputs=[index_status, channel_radio, channel_list_state],
331
+ ).then(
332
+ hide_component, outputs=[add_channel_modal]
333
+ ).then(
334
+ open_component, outputs=[my_sidebar]
335
+ ).then(
336
+ enable_component, outputs=[save_add_channels_btn]
337
+ ).then(
338
+ toggle_no_data_found,
339
+ inputs=[channel_list_state],
340
+ outputs=[channel_radio, no_channels_message],
341
+ )
342
+ ## Onload refresh the channel list.
343
+ gr.on(fn=refresh_channel_list, outputs=[channel_radio]).then(
344
+ fn=list_channels_radio, outputs=[channel_list_state]
345
+ )
346
+ # Main Column
347
+ main_content_no_channels_html = gr.HTML(
348
+ """
349
+ <div style="
350
+ display: flex;
351
+ justify-content: center;
352
+ align-items: center;
353
+ height: 150px;
354
+ ">
355
+ <div style="
356
+ border: 2px solid #FFA500;
357
+ background-color: #FFF8E1;
358
+ color: #FF6F00;
359
+ padding: 20px 30px;
360
+ border-radius: 12px;
361
+ font-weight: bold;
362
+ font-size: 1.2rem;
363
+ text-align: center;
364
+ box-shadow: 0 4px 10px rgba(0,0,0,0.1);
365
+ ">
366
+ ⚠️ No channels added.<br>
367
+ Please add channels from the side bar
368
+ </div>
369
+ </div>
370
+
371
+ """,
372
+ visible=True if not channel_list_state.value else False,
373
+ )
374
+ with gr.Column(
375
+ scale=3, visible=True if channel_list_state.value else False
376
+ ) as main_content:
377
+ with gr.Row():
378
+ search_channel = gr.Dropdown(
379
+ label="Select a Channel",
380
+ choices=[("All Channels", None)] + channel_list_state.value,
381
+ value=None,
382
+ )
383
+ question = gr.Textbox(
384
+ label="Ask a Question",
385
+ placeholder="e.g., How to write the letter Aa in grantham?",
386
+ submit_btn=True,
387
+ )
388
+ gr.Column(scale=2)
389
+
390
+ gr.Examples(
391
+ [
392
+ "Srirangam",
393
+ "Gajendra moksham",
394
+ "Poorvikalyani",
395
+ "Virutham from chathusloki",
396
+ "Lesson 9.15 from Aksharam",
397
+ ],
398
+ inputs=question,
399
+ )
400
+
401
+ submitted_question = gr.Markdown()
402
+ ask_status = gr.Markdown()
403
+ answer = gr.Markdown()
404
+ video_embed = gr.HTML() # iframe embeds
405
+
406
+ def get_question(q):
407
+ return f"## You asked : {q}\n---"
408
+
409
+ # question.change(enable_if_not_none, inputs=[question], outputs=[question])
410
+ question.submit(show_loading, inputs=[question], outputs=[ask_status]).then(
411
+ get_question, inputs=[question], outputs=[submitted_question]
412
+ ).then(disable_component, outputs=[question]).then(
413
+ handle_query,
414
+ inputs=[question, search_channel],
415
+ outputs=[answer, video_embed],
416
+ ).then(
417
+ enable_component, outputs=[question]
418
+ ).then(
419
+ clear_component, outputs=[ask_status]
420
+ )
421
+
422
+ # Show videos modal when button clicked
423
+ def show_selected_channel_videos(selected_channel_id):
424
+ # print("selected_channel_id = ", selected_channel_id)
425
+ df = fetch_channel_dataframe(selected_channel_id)
426
+ return gr.update(value=df, label=f"{len(df)} videos")
427
+
428
+ channel_radio.change(
429
+ enable_if_not_none, inputs=[channel_radio], outputs=[show_videos_btn]
430
+ ).then(enable_if_not_none, inputs=[channel_radio], outputs=[export_btn])
431
+ show_videos_btn.click(disable_component, outputs=[show_videos_btn]).then(
432
+ close_component, outputs=[my_sidebar]
433
+ ).then(
434
+ show_selected_channel_videos,
435
+ inputs=[channel_radio],
436
+ outputs=[channel_videos_df],
437
+ ).then(
438
+ show_component, outputs=[videos_list_modal]
439
+ ).then(
440
+ enable_component, outputs=[show_videos_btn]
441
+ )
442
+
443
+ delete_channel_btn.click(
444
+ disable_component, outputs=[delete_channel_btn]
445
+ ).then(
446
+ delete_channel, # function
447
+ inputs=[channel_radio], # selected channel name
448
+ outputs=[channel_radio], # update the radio choices
449
+ ).then(
450
+ enable_component, outputs=[delete_channel_btn]
451
+ )
452
+ channel_list_state.change(
453
+ toggle_no_data_found,
454
+ inputs=[channel_list_state],
455
+ outputs=[main_content, main_content_no_channels_html],
456
+ ).then(
457
+ toggle_no_data_found,
458
+ inputs=[channel_list_state],
459
+ outputs=[channel_radio, no_channels_message],
460
+ )
461
+
462
+ def get_channel_choices(channel_list):
463
+ return gr.update(choices=[("All Channels", None)] + channel_list)
464
+
465
+ channel_list_state.change(
466
+ get_channel_choices, inputs=[channel_list_state], outputs=[search_channel]
467
+ )
468
+
469
+ export_btn.click(close_component, outputs=[my_sidebar]).then(
470
+ show_component, outputs=[download_status]
471
+ ).then(hide_component, outputs=[download_ready_btn]).then(
472
+ show_component, outputs=[download_modal]
473
+ ).then(
474
+ export_channel_json, inputs=channel_radio, outputs=download_ready_btn
475
+ ).then(
476
+ hide_component, outputs=[download_status]
477
+ ).then(
478
+ show_component, outputs=[download_ready_btn]
479
+ )
480
+
481
+
482
def initialize_youtube_metadata_and_poll():
    """Index the default channels, then start the YouTube poller.

    Runs synchronously; intended to be launched from a daemon thread at
    import time so the Gradio app is not blocked.
    """
    # Step 1: Initialize metadata (progress messages go to stdout only)
    for msg in youtube_metadata_init():
        print(msg)

    # Step 2: Start polling after init
    start_poll()  # run in the same thread
    # OR if you want it in a separate daemon thread:
    # poll_thread = threading.Thread(target=start_poll, daemon=True)
    # poll_thread.start()
492
+
493
# Launch the whole thing in a background thread
# (daemon=True so it never blocks interpreter shutdown)
yt_init_thread = threading.Thread(target=initialize_youtube_metadata_and_poll, daemon=True)
yt_init_thread.start()

if __name__ == "__main__":
    # NOTE(review): the daemon thread above has already started this same
    # routine at import time; running it again here likely double-indexes —
    # confirm intent.
    initialize_youtube_metadata_and_poll()
    # Start polling in a background thread
    youtube_metadata_app.launch()
modules/youtube_metadata/channel_utils.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.youtube_metadata.db import get_collection
2
+ import pandas as pd
3
+
4
+ page_size = 10 # change if you like
5
+
6
+
7
+ # -------------------------------
8
+ # Fetch channel videos as HTML table with pagination
9
+ # -------------------------------
10
def fetch_channel_html(channel_id: str, page: int = 1, page_size: int = 10):
    """Render one page of a channel's videos as an HTML table.

    Returns a centred warning <div> when the page has no results.
    NOTE(review): the first collection.get() fetches *all* metadatas just to
    compute the total count — O(channel size) per page; fine for small
    channels.
    """
    collection = get_collection()
    offset = (page - 1) * page_size  # Chroma pagination is limit/offset based

    # Unpaginated fetch, used only for the total count shown above the table.
    all_results = collection.get(
        where={"channel_id": channel_id}, include=["metadatas"]
    )
    total_count = (
        len(all_results["metadatas"])
        if all_results and "metadatas" in all_results
        else 0
    )
    # The actual page of results.
    results = collection.get(
        where={"channel_id": channel_id},
        include=["documents", "metadatas"],
        limit=page_size,
        offset=offset,
    )

    # handle empty
    if not results or not results.get("metadatas"):
        return f"""
        <div style="display:flex;justify-content:center;align-items:center;
        height:200px;flex-direction:column;color:#666;">
        ⚠️ No videos found for this channel (page {page}).
        </div>
        """

    videos = results["metadatas"]

    # build table
    html = (
        f"<div>Total: {total_count} videos</div>"
        + """
    <table border="1" style="border-collapse:collapse;width:100%;font-family:sans-serif;">
    <thead style="background:#f0f0f0;">
    <tr>
    <th>#</th>
    <th>Title</th>
    <th>Video URL</th>
    <th>Description</th>
    </tr>
    </thead>
    <tbody>
    """
    )

    # Row numbers continue across pages (offset + 1).
    for idx, v in enumerate(videos, start=offset + 1):
        html += f"""
        <tr>
        <td>{idx}</td>
        <td>{v.get('video_title','')}</td>
        <td><a href="https://youtube.com/watch?v={v.get('video_id')}"
        target="_blank">Watch Video</a></td>
        <td>{v.get('description','')}</td>
        </tr>
        """

    html += "</tbody></table>"
    return html
70
+
71
+
72
+ # -------------------------------
73
+ # Fetch channel videos as a pandas DataFrame
74
+ # -------------------------------
75
def fetch_channel_dataframe(channel_id: str):
    """Return a DataFrame (#, title, description, url) of a channel's videos.

    The ``url`` column is an HTML anchor so the Gradio DataFrame (configured
    with the "html" datatype) renders a clickable watch link.
    """
    collection = get_collection()

    results = collection.get(
        where={"channel_id": channel_id}, include=["documents", "metadatas"]
    )
    # Empty result set -> empty frame (caller shows "0 videos").
    # (The previously computed, unused `total_count` local has been removed.)
    if not results or not results.get("metadatas"):
        return pd.DataFrame(data=[])

    items = []
    for idx, v in enumerate(results["metadatas"], start=1):
        items.append(
            {
                "#": idx,
                "title": v.get("video_title", "-"),
                "description": v.get("description", ""),
                "url": f"""<a style="color: blue" href="https://youtube.com/watch?v={v.get('video_id')}"
            target="_blank">▶️Watch Video</a>""",
            }
        )
    return pd.DataFrame(data=items)
99
+
100
+
101
def update_table(channel_id, page):
    """Render *page* of the channel's videos plus its page label."""
    html = fetch_channel_html(channel_id, page, page_size)
    return html, f"Page {page}"
103
+
104
+
105
def prev_page(channel_id, page):
    """Step one page back (clamped at 1); return (html, label, page)."""
    new_page = max(1, page - 1)
    html = fetch_channel_html(channel_id, new_page, page_size)
    return html, f"Page {new_page}", new_page
112
+
113
+
114
def next_page(channel_id, page):
    """Advance one page; return (html, label, page). No upper bound check."""
    new_page = page + 1
    html = fetch_channel_html(channel_id, new_page, page_size)
    return html, f"Page {new_page}", new_page
modules/youtube_metadata/collector.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -------------------------------
2
+ # 1. Collector
3
+ # -------------------------------
4
+ from googleapiclient.discovery import build
5
+ from modules.youtube_metadata.youtube_utils import get_channel_id
6
+ import logging
7
+
8
+ logging.basicConfig()
9
+ logger=logging.getLogger(__name__)
10
+ logger.setLevel(logging.INFO)
11
+
12
def fetch_all_channel_videos(api_key: str, channel_url: str, max_results_per_call=50):
    """Stream all videos of a channel, yielding ("Fetched N", batch) pairs.

    Resolves the channel URL to a channel id, then pages through the uploads
    playlist. Each yield carries only the *new* batch; a final yield with an
    empty batch reports the grand total.
    """
    youtube = build("youtube", "v3", developerKey=api_key)
    channel_id = get_channel_id(youtube, channel_url)

    final_videos = []  # running accumulator, used only for the count in messages
    for videos in fetch_channel_videos_by_id(api_key, channel_id, max_results_per_call):
        final_videos.extend(videos)
        logger.info("fetch_all_channel_videos: Fetched %d", len(final_videos))
        yield (f"Fetched {len(final_videos)}", videos)  # <-- only yield the *new* batch

    yield (f"Fetched {len(final_videos)}", [])  # final "summary"
23
+
24
+
25
def fetch_channel_videos_by_id(api_key: str, channel_id: str, max_results=50):
    """Yield pages of video-metadata dicts for *channel_id*.

    Uses the channel's auto-generated "uploads" playlist and follows
    nextPageToken until exhausted. Each yielded item is a list of dicts with
    video_id, title, description, channel_id and channel_title keys.
    """
    youtube = build("youtube", "v3", developerKey=api_key)

    # Get uploads playlist ID
    channel_response = youtube.channels().list(
        part="contentDetails,snippet", id=channel_id
    ).execute()

    channel_title = channel_response["items"][0]["snippet"]["title"]
    uploads_playlist_id = channel_response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    next_page_token = None

    while True:
        request = youtube.playlistItems().list(
            part="snippet",
            playlistId=uploads_playlist_id,
            maxResults=max_results,
            pageToken=next_page_token,
        )
        response = request.execute()

        videos = []
        for item in response.get("items", []):
            snippet = item["snippet"]
            videos.append(
                {
                    "video_id": snippet["resourceId"]["videoId"],
                    "title": snippet["title"],
                    "description": snippet.get("description", ""),
                    "channel_id": channel_id,
                    "channel_title": channel_title,
                }
            )

        yield videos  # yield one page worth

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
65
+
modules/youtube_metadata/db.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+
3
+ from config import SanatanConfig
4
+
5
+ config = SanatanConfig()
6
+ YT_METADATA_COLLECTION_NAME = config.get_collection_name(scripture_name="yt_metadata")
7
+
8
def get_client():
    """Return a persistent Chroma client rooted at the configured DB path."""
    return chromadb.PersistentClient(path=config.dbStorePath)
11
+
12
+
13
def get_collection():
    """Fetch the YT-metadata collection, creating it if it doesn't exist yet."""
    client = get_client()
    try:
        return client.get_collection(YT_METADATA_COLLECTION_NAME)
    except Exception:
        # Collection not present (or unreadable) — create a fresh one.
        return client.create_collection(YT_METADATA_COLLECTION_NAME)
32
+
33
+
34
+ # modules/db.py
35
# modules/db.py
def get_indexed_channels(collection=None):
    """Return {channel_id: channel_title} for every channel in the index.

    BUG FIX: the default used to be ``collection=get_collection()``, which is
    evaluated once at import time (triggering a DB call on import) and then
    shares that single collection object across all calls. The default is now
    resolved lazily per call.
    """
    if collection is None:
        collection = get_collection()

    results = collection.get(include=["metadatas"])
    channels = {}

    for meta in results["metadatas"]:
        cid = meta.get("channel_id")  # safe — metadata may lack the key
        cname = meta.get("channel_title", "Unknown Channel")

        if cid:  # only include entries that actually carry a channel_id
            channels[cid] = cname
    return channels
47
+
48
+
49
+ # -------------------------------
50
+ # Delete a channel
51
+ # -------------------------------
52
def delete_channel_from_collection(channel_id: str):
    """Delete every indexed video belonging to *channel_id* from the index."""
    get_collection().delete(where={"channel_id": channel_id})
59
+
60
+
61
def fetch_channel_data(channel_id: str):
    """Return everything stored for a channel: embeddings, metadata, documents."""
    return get_collection().get(
        where={"channel_id": channel_id},
        include=["embeddings", "metadatas", "documents"],
    )
modules/youtube_metadata/downloader.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import tempfile
4
+ import os
5
+
6
+ from modules.youtube_metadata.db import fetch_channel_data
7
+
8
def json_serializer(obj):
    """json.dump fallback: unwrap array-likes via .tolist(), stringify the rest."""
    to_list = getattr(obj, "tolist", None)  # NumPy arrays and friends
    if to_list is not None:
        return to_list()
    return str(obj)
12
+
13
def export_channel_json(channel_id):
    """Dump one channel's indexed data to a temporary .json file.

    Returns the path of the written file (caller is responsible for cleanup).
    """
    payload = fetch_channel_data(channel_id)
    fd, path = tempfile.mkstemp(suffix=".json")
    with os.fdopen(fd, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False, default=json_serializer)
    return path
modules/youtube_metadata/embeddings.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from openai import OpenAI
3
+ from dotenv import load_dotenv
4
+ load_dotenv()
5
+
6
+
7
+ # Step 1: Load SentenceTransformer model
8
+ # Old MiniLM version:
9
+ # model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
10
+
11
+ # Better MPNet alternative:
12
+ model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
13
+ client = OpenAI()
14
+
15
def _get_hf_embedding(text: str) -> list:
    """Embed ``text`` with the module-level SentenceTransformer model."""
    vector = model.encode(text)
    return vector.tolist()
17
+
18
def _get_openai_embedding(text: str) -> list:
    """Embed ``text`` via the OpenAI embeddings API (text-embedding-3-large)."""
    response = client.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    )
    return response.data[0].embedding
24
+
25
+
26
def get_embedding(text: str) -> list:
    """Return the embedding vector for ``text``.

    Currently delegates to the OpenAI backend; swap in ``_get_hf_embedding``
    here to use the local SentenceTransformer model instead.
    """
    return _get_openai_embedding(text)
modules/youtube_metadata/indexer.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modules/indexer.py
2
+ from typing import Dict, List
3
+ from openai import OpenAI
4
+ from modules.youtube_metadata.embeddings import get_embedding
5
+ import logging
6
+
7
+ logging.basicConfig()
8
+ logger = logging.getLogger(__name__)
9
+ logger.setLevel(logging.INFO)
10
+
11
+
12
def index_videos(
    videos: List[Dict], collection, channel_url: str, batch_size: int = 50
):
    """Embed and add ``videos`` to ``collection`` in batches of ``batch_size``.

    Each document is "<title> - <description>"; metadata records the video
    id/title/description and channel provenance. Videos without a
    ``video_id`` are skipped with a warning, since Chroma rejects null ids
    (previously they would have been passed through and broken the whole
    ``collection.add`` batch). The unused per-call ``OpenAI()`` client has
    been removed — embeddings come from ``get_embedding``.

    Args:
        videos: Video dicts (``video_id`` required; ``title``, ``description``,
            ``channel_id``, ``channel_title`` optional).
        collection: Target Chroma collection.
        channel_url: Stored in each entry's metadata.
        batch_size: Videos embedded/inserted per round trip.

    Returns:
        Number of videos indexed.
    """
    valid = [v for v in videos if v.get("video_id")]
    skipped = len(videos) - len(valid)
    if skipped:
        logger.warning("index_videos: skipping %d videos without a video_id", skipped)

    total = len(valid)
    logger.info(
        "index_videos: [INDEX] Starting indexing for %d videos (channel=%s)",
        total,
        channel_url,
    )

    for start in range(0, total, batch_size):
        batch = valid[start : start + batch_size]
        end = start + len(batch)
        percent = round((end / total) * 100, 1)

        logger.info(
            "index_videos: [INDEX] Processing batch %d → %d of %d — %s%%",
            start + 1,
            end,
            total,
            percent,
        )

        # Document text is title + description, matching what gets embedded.
        texts = [
            f"{vid.get('title', '')} - {vid.get('description', '')}" for vid in batch
        ]
        embeddings = [get_embedding(text) for text in texts]

        metadatas, ids = [], []
        for vid in batch:
            metadata = {
                "video_id": vid["video_id"],
                "video_title": vid.get("title", ""),
                "description": vid.get("description", ""),
                "channel_url": channel_url,
            }
            if "channel_id" in vid:
                metadata["channel_id"] = vid["channel_id"]
            if "channel_title" in vid:
                metadata["channel_title"] = vid["channel_title"]

            metadatas.append(metadata)
            ids.append(vid["video_id"])

        collection.add(
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )

        logger.info(
            "index_videos: [INDEX] ✅ Indexed %d videos (total so far: %d/%d — %s%%)",
            len(batch),
            end,
            total,
            percent,
        )

    logger.info(
        "index_videos: [INDEX] 🎉 Finished indexing %d videos for channel=%s",
        total,
        channel_url,
    )
    return total
modules/youtube_metadata/retriever.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modules/retriever.py
2
+ from typing import List, Dict
3
+ from openai import OpenAI
4
+
5
+ from modules.youtube_metadata.embeddings import get_embedding
6
+
7
+
8
def retrieve_videos(
    query: str, collection, top_k: int = 3, channel_id: str = None
) -> List[Dict]:
    """Semantic search over the video index.

    Args:
        query: Free-text search string.
        collection: Chroma collection to query.
        top_k: Maximum number of results.
        channel_id: Optional filter restricting results to one channel.

    Returns:
        List of dicts with keys video_id, video_title, channel, description,
        and score (Chroma distance — smaller means closer).

    Previously this built an unused ``OpenAI()`` client on every call and
    duplicated the whole ``collection.query`` call just to toggle ``where``;
    both are fixed.
    """
    embedding = get_embedding(query)

    # Build the query once; add the channel filter only when requested.
    query_kwargs = {
        "query_embeddings": [embedding],
        "n_results": top_k,
        "include": ["metadatas", "documents", "distances"],
    }
    if channel_id:
        query_kwargs["where"] = {"channel_id": channel_id}
    results = collection.query(**query_kwargs)

    # Chroma returns one inner list per query embedding; we sent exactly one.
    metadatas_list = results.get("metadatas", [[]])[0]
    documents_list = results.get("documents", [[]])[0]
    distances_list = results.get("distances", [[]])[0]

    videos = []
    for idx, meta in enumerate(metadatas_list):
        videos.append(
            {
                "video_id": meta.get("video_id", ""),
                "video_title": meta.get(
                    "video_title", meta.get("title", documents_list[idx])
                ),
                "channel": meta.get("channel", meta.get("channel_title", "")),
                "description": documents_list[idx] if idx < len(documents_list) else "",
                "score": distances_list[idx] if idx < len(distances_list) else None,
            }
        )

    return videos
modules/youtube_metadata/youtube_poller.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from chromadb import Collection
2
+ import feedparser
3
+ from modules.youtube_metadata.db import get_collection, get_indexed_channels
4
+ from modules.youtube_metadata.embeddings import get_embedding
5
+ import logging
6
+
7
+ logging.basicConfig()
8
+ logger = logging.getLogger(__name__)
9
+ logger.setLevel(logging.INFO)
10
+
11
+
12
def fetch_channel_videos_rss(channel_id, max_results=50):
    """Fetch recent uploads for a channel from YouTube's public RSS feed.

    Returns a list of video dicts (video_id, video_title, description,
    published, video_url, channel_url, channel_id, channel_title,
    channel_author). NOTE(review): the YouTube RSS feed only exposes the most
    recent uploads, so this suits incremental polling, not full backfills.

    Bug fix: the old description fallback chain started with
    ``entry.title_detail.value`` — which is just the *title* — so the
    description duplicated the title whenever one existed. The media
    description / summary are now preferred.
    """
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    logger.info("fetch_channel_videos_rss: feed_url = %s", feed_url)
    feed = feedparser.parse(feed_url)

    # Channel-level fields come from the <feed> header, not the entries.
    channel_title = getattr(feed.feed, "title", None)
    channel_url = getattr(feed.feed, "link", None)
    channel_author = getattr(feed.feed, "author", "")

    logger.info("fetch_channel_videos_rss: channel_title = %s", channel_title)

    videos = []
    for entry in feed.entries[:max_results]:
        description = (
            getattr(entry, "media_description", None)
            or getattr(entry, "summary", None)
            or ""
        )

        videos.append(
            {
                "video_id": entry.yt_videoid,
                "video_title": entry.title,
                "description": description,
                "published": entry.published,
                "video_url": entry.link,
                "channel_url": channel_url,
                "channel_id": channel_id,
                "channel_title": channel_title,
                "channel_author": channel_author,
            }
        )

    return videos
48
+
49
+
50
def get_existing_video_ids(collection, channel_id):
    """Return the set of video_ids already indexed for ``channel_id``."""
    results = collection.get(where={"channel_id": channel_id})
    return {
        meta["video_id"]
        for meta in results.get("metadatas", [])
        if meta and "video_id" in meta
    }
59
+
60
+
61
def filter_new_videos(videos, existing_ids):
    """Keep only the videos whose video_id is not already indexed."""
    fresh = []
    for video in videos:
        if video["video_id"] not in existing_ids:
            fresh.append(video)
    return fresh
63
+
64
+
65
def add_to_chroma(collection: Collection, new_videos):
    """Embed and insert ``new_videos`` (as produced by
    fetch_channel_videos_rss) into ``collection``.

    Bug fix: the previous version read ``v["title"]`` and ``v["link"]``, but
    fetch_channel_videos_rss emits ``video_title`` and ``video_url`` — every
    poll cycle that found new videos raised KeyError. Metadata now also
    stores ``video_title``/``channel_title`` for consistency with the bulk
    indexer (index_videos).
    """
    if not new_videos:
        return
    titles = [v["video_title"] for v in new_videos]
    collection.add(
        documents=titles,
        embeddings=[get_embedding(title) for title in titles],
        metadatas=[
            {
                "video_id": v["video_id"],
                "channel_id": v["channel_id"],
                "video_title": v["video_title"],
                "channel_title": v.get("channel_title", ""),
                "link": v["video_url"],
            }
            for v in new_videos
        ],
        ids=[v["video_id"] for v in new_videos],
    )
81
+
82
+
83
def incremental_update(collection, channel_id):
    """Fetch the channel's RSS feed and index any videos not yet stored."""
    existing_ids = get_existing_video_ids(collection, channel_id)
    latest_videos = fetch_channel_videos_rss(channel_id)
    new_videos = filter_new_videos(latest_videos, existing_ids)

    if new_videos:
        add_to_chroma(collection, new_videos)
        logger.info(
            "incremental_update: Added %d new videos from %s",
            len(new_videos),
            channel_id,
        )
    else:
        # fixed log-prefix typo: was "incremental_uddate"
        logger.info("incremental_update: No new videos for %s", channel_id)
95
+
96
+
97
def start_poll():
    """Poll every indexed channel for new videos, forever, every 10 minutes.

    Bug fix: the previous version snapshot the channel list once before the
    loop, so channels indexed after the poller started were never polled
    until a process restart. The list is now re-read each cycle.
    """
    import time

    while True:
        collection = get_collection()
        for channel_id in get_indexed_channels(collection).keys():
            incremental_update(collection, channel_id)
        time.sleep(600)  # 10 minutes between poll cycles
modules/youtube_metadata/youtube_sync.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import gradio as gr
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+
5
+ from modules.youtube_metadata.collector import fetch_all_channel_videos
6
+ from modules.youtube_metadata.db import get_collection
7
+ from modules.youtube_metadata.indexer import index_videos
8
+
9
# Cooperative-cancellation flag shared with sync_channels_from_youtube.
stop_event = threading.Event()
MAX_BATCHES = 200  # safety cutoff


def stop_sync():
    """External call to stop the sync process."""
    stop_event.set()
16
+
17
def sync_channels_from_youtube(api_key, channel_urls: list, progress: gr.Progress = None):
    """Sync multiple channels, yielding (progress_message, videos_indexed_in_batch).

    Honors the module-level stop_event: checked before each channel starts.
    """
    global stop_event
    stop_event.clear()

    total_channels = len(channel_urls)
    total_videos = 0

    for idx, channel_url in enumerate(channel_urls, 1):
        if stop_event.is_set():
            yield f"🛑 Stopped before processing channel: {channel_url}", 0
            break

        yield f"🔄 Syncing {channel_url} ({idx}/{total_channels})", 0

        # Relay per-batch progress from the single-channel worker.
        for message, batch_count in _refresh_single_channel(api_key, channel_url, progress):
            total_videos += batch_count
            yield message, batch_count

    yield f"✅ Finished syncing. Total channels: {total_channels}, total videos: {total_videos}", 0
40
+
41
+
42
def _refresh_single_channel(api_key, channel_url, progress):
    """Fetch and index every video of one channel, yielding progress tuples.

    Yields (message, videos_indexed_increment). Batches are indexed in
    parallel; a stop request aborts between completed batches.

    Bug fix: when index_videos returned None, the old fallback counted
    ``len(all_videos)`` (the WHOLE channel) for that one batch, wildly
    over-reporting progress. Futures are now mapped to their own batch
    sizes so the fallback is exact.
    """
    # Fetch all batches up front so we know the true total.
    fetched_batches = list(fetch_all_channel_videos(api_key, channel_url))
    all_videos = [
        v | {"channel_url": channel_url} for _, batch in fetched_batches for v in batch
    ]
    total_videos = len(all_videos)

    if total_videos == 0:
        yield f"{channel_url}: No videos found", 0
        return

    with ThreadPoolExecutor(max_workers=4) as executor:
        # future -> size of the batch it is indexing
        future_sizes = {
            executor.submit(
                index_videos, batch, get_collection(), channel_url=channel_url
            ): len(batch)
            for _, batch in fetched_batches
        }

        completed_videos = 0
        for future in as_completed(future_sizes):
            if stop_event.is_set():
                yield "🛑 Stop requested during indexing stage", completed_videos
                break

            try:
                indexed_count = future.result()
                if indexed_count is None:
                    # Fall back to this batch's own size, not the whole channel.
                    indexed_count = future_sizes[future]
            except Exception as e:
                indexed_count = 0
                yield f"⚠️ Error indexing {channel_url}: {e}", completed_videos

            completed_videos += indexed_count
            pct = 100.0 * completed_videos / max(1, total_videos)

            if progress:
                progress(completed_videos / total_videos)

            yield f"{channel_url}: Indexed {completed_videos}/{total_videos} videos — {pct:.1f}%", completed_videos
modules/youtube_metadata/youtube_utils.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def get_channel_id(youtube, channel_url: str) -> str:
    """Resolve a YouTube channel URL or handle to its UC... channel id.

    Supported inputs:
      - https://www.youtube.com/channel/UCxxxx (id extracted from the URL)
      - https://www.youtube.com/@handle or bare @handle (resolved via the
        YouTube Data API ``channels.list(forHandle=...)``)
      - a bare UC... id (returned unchanged)

    Raises:
        ValueError: if the input matches none of the supported formats.
    """
    # Canonical /channel/UC... URL: the id is right there.
    if "channel/" in channel_url:
        tail = channel_url.split("channel/")[-1]
        return tail.split("/")[0]

    # Handle form (@xyz or a full URL containing one): ask the API.
    if "@" in channel_url:
        handle = channel_url.split("@")[-1]
        response = youtube.channels().list(part="id", forHandle=handle).execute()
        return response["items"][0]["id"]

    # Already a bare channel id.
    if channel_url.startswith("UC"):
        return channel_url

    raise ValueError(f"Unsupported channel URL format {channel_url}")
pyproject.toml CHANGED
@@ -8,10 +8,12 @@ dependencies = [
8
  "chromadb>=1.0.15",
9
  "dotenv>=0.9.9",
10
  "dropbox>=12.0.2",
 
11
  "google-api-python-client>=2.177.0",
12
  "google-auth-httplib2>=0.2.0",
13
  "google-auth-oauthlib>=1.2.2",
14
  "gradio>=5.39.0",
 
15
  "gspread>=6.2.1",
16
  "langchain>=0.3.27",
17
  "langchain-community>=0.3.27",
 
8
  "chromadb>=1.0.15",
9
  "dotenv>=0.9.9",
10
  "dropbox>=12.0.2",
11
+ "feedparser>=6.0.12",
12
  "google-api-python-client>=2.177.0",
13
  "google-auth-httplib2>=0.2.0",
14
  "google-auth-oauthlib>=1.2.2",
15
  "gradio>=5.39.0",
16
+ "gradio-modal>=0.0.4",
17
  "gspread>=6.2.1",
18
  "langchain>=0.3.27",
19
  "langchain-community>=0.3.27",
uv.lock CHANGED
@@ -502,6 +502,18 @@ wheels = [
502
  { url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" },
503
  ]
504
 
 
 
 
 
 
 
 
 
 
 
 
 
505
  [[package]]
506
  name = "ffmpy"
507
  version = "0.6.1"
@@ -740,6 +752,18 @@ wheels = [
740
  { url = "https://files.pythonhosted.org/packages/e0/38/7f50ae95de8fa419276742230f57a34e8c0f47231da0ad54479dd0088972/gradio_client-1.11.0-py3-none-any.whl", hash = "sha256:afb714aea50224f6f04679fe2ce79c1be75011012d0dc3b3ee575610a0dc8eb2", size = 324452, upload-time = "2025-07-17T02:02:44.542Z" },
741
  ]
742
 
 
 
 
 
 
 
 
 
 
 
 
 
743
  [[package]]
744
  name = "greenlet"
745
  version = "3.2.3"
@@ -2773,10 +2797,12 @@ dependencies = [
2773
  { name = "chromadb" },
2774
  { name = "dotenv" },
2775
  { name = "dropbox" },
 
2776
  { name = "google-api-python-client" },
2777
  { name = "google-auth-httplib2" },
2778
  { name = "google-auth-oauthlib" },
2779
  { name = "gradio" },
 
2780
  { name = "gspread" },
2781
  { name = "langchain" },
2782
  { name = "langchain-community" },
@@ -2792,10 +2818,12 @@ requires-dist = [
2792
  { name = "chromadb", specifier = ">=1.0.15" },
2793
  { name = "dotenv", specifier = ">=0.9.9" },
2794
  { name = "dropbox", specifier = ">=12.0.2" },
 
2795
  { name = "google-api-python-client", specifier = ">=2.177.0" },
2796
  { name = "google-auth-httplib2", specifier = ">=0.2.0" },
2797
  { name = "google-auth-oauthlib", specifier = ">=1.2.2" },
2798
  { name = "gradio", specifier = ">=5.39.0" },
 
2799
  { name = "gspread", specifier = ">=6.2.1" },
2800
  { name = "langchain", specifier = ">=0.3.27" },
2801
  { name = "langchain-community", specifier = ">=0.3.27" },
@@ -2928,6 +2956,12 @@ wheels = [
2928
  { url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486, upload-time = "2025-05-27T00:56:49.664Z" },
2929
  ]
2930
 
 
 
 
 
 
 
2931
  [[package]]
2932
  name = "shellingham"
2933
  version = "1.5.4"
 
502
  { url = "https://files.pythonhosted.org/packages/e5/47/d63c60f59a59467fda0f93f46335c9d18526d7071f025cb5b89d5353ea42/fastapi-0.116.1-py3-none-any.whl", hash = "sha256:c46ac7c312df840f0c9e220f7964bada936781bc4e2e6eb71f1c4d7553786565", size = 95631, upload-time = "2025-07-11T16:22:30.485Z" },
503
  ]
504
 
505
+ [[package]]
506
+ name = "feedparser"
507
+ version = "6.0.12"
508
+ source = { registry = "https://pypi.org/simple" }
509
+ dependencies = [
510
+ { name = "sgmllib3k" },
511
+ ]
512
+ sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" }
513
+ wheels = [
514
+ { url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 81480, upload-time = "2025-09-10T13:33:58.022Z" },
515
+ ]
516
+
517
  [[package]]
518
  name = "ffmpy"
519
  version = "0.6.1"
 
752
  { url = "https://files.pythonhosted.org/packages/e0/38/7f50ae95de8fa419276742230f57a34e8c0f47231da0ad54479dd0088972/gradio_client-1.11.0-py3-none-any.whl", hash = "sha256:afb714aea50224f6f04679fe2ce79c1be75011012d0dc3b3ee575610a0dc8eb2", size = 324452, upload-time = "2025-07-17T02:02:44.542Z" },
753
  ]
754
 
755
+ [[package]]
756
+ name = "gradio-modal"
757
+ version = "0.0.4"
758
+ source = { registry = "https://pypi.org/simple" }
759
+ dependencies = [
760
+ { name = "gradio" },
761
+ ]
762
+ sdist = { url = "https://files.pythonhosted.org/packages/e2/fd/3b383f9ee8d60625e9e26871ba4adcacbedeab132041b94290758e02e543/gradio_modal-0.0.4.tar.gz", hash = "sha256:717ae699072a171648cfa1b84bc153be84e92d04e9ad58c1bc59af68ef332726", size = 1180812, upload-time = "2024-10-15T23:46:06.134Z" }
763
+ wheels = [
764
+ { url = "https://files.pythonhosted.org/packages/05/3d/76f454de84ae1dccbf2b7023e933afb8dde5fdd89e9476786726ef770737/gradio_modal-0.0.4-py3-none-any.whl", hash = "sha256:d96e817d2e934d9e1b835b06474f45fd349b5ccea499d1536bfb4bd38f62dedb", size = 1106241, upload-time = "2024-10-15T23:46:04.13Z" },
765
+ ]
766
+
767
  [[package]]
768
  name = "greenlet"
769
  version = "3.2.3"
 
2797
  { name = "chromadb" },
2798
  { name = "dotenv" },
2799
  { name = "dropbox" },
2800
+ { name = "feedparser" },
2801
  { name = "google-api-python-client" },
2802
  { name = "google-auth-httplib2" },
2803
  { name = "google-auth-oauthlib" },
2804
  { name = "gradio" },
2805
+ { name = "gradio-modal" },
2806
  { name = "gspread" },
2807
  { name = "langchain" },
2808
  { name = "langchain-community" },
 
2818
  { name = "chromadb", specifier = ">=1.0.15" },
2819
  { name = "dotenv", specifier = ">=0.9.9" },
2820
  { name = "dropbox", specifier = ">=12.0.2" },
2821
+ { name = "feedparser", specifier = ">=6.0.12" },
2822
  { name = "google-api-python-client", specifier = ">=2.177.0" },
2823
  { name = "google-auth-httplib2", specifier = ">=0.2.0" },
2824
  { name = "google-auth-oauthlib", specifier = ">=1.2.2" },
2825
  { name = "gradio", specifier = ">=5.39.0" },
2826
+ { name = "gradio-modal", specifier = ">=0.0.4" },
2827
  { name = "gspread", specifier = ">=6.2.1" },
2828
  { name = "langchain", specifier = ">=0.3.27" },
2829
  { name = "langchain-community", specifier = ">=0.3.27" },
 
2956
  { url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486, upload-time = "2025-05-27T00:56:49.664Z" },
2957
  ]
2958
 
2959
+ [[package]]
2960
+ name = "sgmllib3k"
2961
+ version = "1.0.0"
2962
+ source = { registry = "https://pypi.org/simple" }
2963
+ sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" }
2964
+
2965
  [[package]]
2966
  name = "shellingham"
2967
  version = "1.5.4"