Spaces:

vikramvasudevan
/

sanatan_ai

Running on CPU Upgrade

App Files Files Community

sanatan_ai / modules /youtube_metadata /collector.py

vikramvasudevan

Upload folder using huggingface_hub

4aebf77 verified about 2 months ago

raw

history blame contribute delete

2.24 kB

	# -------------------------------
	# 1. Collector
	# -------------------------------
	from googleapiclient.discovery import build
	from modules.youtube_metadata.youtube_utils import get_channel_id
	import logging

	# Module-level logger for the collector; INFO is enabled so the per-page
	# progress messages emitted while fetching are not filtered out.
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.INFO)

	# Make sure the root logger has a handler so records are actually emitted.
	logging.basicConfig()

	def fetch_all_channel_videos(api_key: str, channel_url: str, max_results_per_call: int = 50):
	    """Stream a channel's uploaded videos page by page, with progress messages.

	    The channel URL is resolved to a channel ID with ``get_channel_id`` and
	    every page produced by ``fetch_channel_videos_by_id`` is relayed to the
	    caller as soon as it arrives.

	    Args:
	        api_key: Developer key passed to the YouTube Data API client.
	        channel_url: URL of the channel, resolved via ``get_channel_id``.
	        max_results_per_call: Page size forwarded to
	            ``fetch_channel_videos_by_id``.

	    Yields:
	        ``(message, videos)`` tuples. For each page, ``message`` is
	        ``"Fetched <running total>"`` and ``videos`` holds only that page's
	        records. After the last page one more tuple is yielded with the
	        final total and an empty list, acting as a summary.
	    """
	    youtube = build("youtube", "v3", developerKey=api_key)
	    channel_id = get_channel_id(youtube, channel_url)

	    # Only the running count is needed for the progress messages; keeping a
	    # copy of every record here would hold the whole channel in memory even
	    # though the caller already receives each page.
	    total_fetched = 0
	    for videos in fetch_channel_videos_by_id(api_key, channel_id, max_results_per_call):
	        total_fetched += len(videos)
	        logger.info("fetch_all_channel_videos: Fetched %d", total_fetched)
	        yield (f"Fetched {total_fetched}", videos)  # <-- only yield the new batch

	    yield (f"Fetched {total_fetched}", [])  # final "summary"


	def fetch_channel_videos_by_id(api_key: str, channel_id: str, max_results=50):
	    """Yield the videos of a channel's uploads playlist, one API page at a time.

	    Args:
	        api_key: Developer key passed to the YouTube Data API client.
	        channel_id: ID of the channel whose uploads are listed.
	        max_results: ``maxResults`` value sent with every playlist request.

	    Yields:
	        A list of dicts per page, each with the keys ``video_id``, ``title``,
	        ``description``, ``channel_id`` and ``channel_title``.

	    Raises:
	        KeyError: If the channel lookup response has no ``"items"`` entry.
	        IndexError: If the channel lookup response's ``"items"`` is empty.
	    """
	    client = build("youtube", "v3", developerKey=api_key)

	    # One channel lookup provides both the display title and the ID of the
	    # uploads playlist that is paged through below.
	    lookup = client.channels().list(
	        part="contentDetails,snippet", id=channel_id
	    ).execute()
	    channel = lookup["items"][0]
	    channel_title = channel["snippet"]["title"]
	    uploads_id = channel["contentDetails"]["relatedPlaylists"]["uploads"]

	    page_token = None
	    has_more = True
	    while has_more:
	        page = client.playlistItems().list(
	            part="snippet",
	            playlistId=uploads_id,
	            maxResults=max_results,
	            pageToken=page_token,
	        ).execute()

	        snippets = (entry["snippet"] for entry in page.get("items", []))
	        batch = [
	            {
	                "video_id": snippet["resourceId"]["videoId"],
	                "title": snippet["title"],
	                "description": snippet.get("description", ""),
	                "channel_id": channel_id,
	                "channel_title": channel_title,
	            }
	            for snippet in snippets
	        ]

	        yield batch  # one page worth

	        # A missing or empty token means the playlist is exhausted.
	        page_token = page.get("nextPageToken")
	        has_more = bool(page_token)