"""
Lightweight helpers for gathering video metadata and transcripts from YouTube.

Usage:
    python -m python_app.transcript_collector --channel UC123 --output data/raw

Relies on:
- YouTube Data API v3 (requires YOUTUBE_API_KEY).
- youtube-transcript-api for transcript retrieval.

Both libraries are optional at import time so the module can still be
referenced when only working with existing JSON dumps.
"""

from __future__ import annotations

import argparse
import json
import logging
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional

from .config import CONFIG

try:
    from googleapiclient.discovery import build as build_youtube  # type: ignore
except ImportError:  # pragma: no cover - library optional
    build_youtube = None

try:
    from youtube_transcript_api import YouTubeTranscriptApi  # type: ignore
except ImportError:  # pragma: no cover - library optional
    YouTubeTranscriptApi = None

LOGGER = logging.getLogger(__name__)

# Language codes tried when the caller does not request any explicitly.
# Mirrors the default of youtube-transcript-api's own get_transcript().
_DEFAULT_LANGUAGES = ("en",)


@dataclass
class TranscriptSegment:
    """One timed caption line from a video transcript."""

    # Offset from the start of the video, in seconds.
    start: float
    # How long the caption is displayed, in seconds.
    duration: float
    # The caption text.
    text: str


@dataclass
class VideoRecord:
    """Metadata plus transcript for a single YouTube video."""

    video_id: str
    channel_id: str
    channel_title: str
    title: str
    description: str
    published_at: str
    url: str
    transcript: List[TranscriptSegment]


def _ensure_youtube_client(api_key: Optional[str]):
    """Build a YouTube Data API v3 client.

    Raises:
        RuntimeError: If google-api-python-client is not installed or no
            API key was provided.
    """
    if build_youtube is None:
        raise RuntimeError(
            "google-api-python-client not installed. "
            "Install google-api-python-client to collect metadata."
        )
    if not api_key:
        raise RuntimeError(
            "Set YOUTUBE_API_KEY to collect metadata from YouTube."
        )
    return build_youtube("youtube", "v3", developerKey=api_key)


def _ensure_transcript_api():
    """Return a YouTubeTranscriptApi instance.

    Raises:
        RuntimeError: If youtube-transcript-api is not installed.
    """
    if YouTubeTranscriptApi is None:
        raise RuntimeError(
            "youtube-transcript-api not installed. "
            "Install youtube-transcript-api to fetch transcripts."
        )
    return YouTubeTranscriptApi()


def iter_channel_videos(
    channel_id: str,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 10,
) -> Iterator[Dict]:
    """
    Yield raw playlist items for the uploads playlist of the given channel.

    Args:
        channel_id: Target YouTube channel ID.
        api_key: Explicit API key (defaults to config value).
        max_pages: Hard cap on paginated playlist fetches to keep things
            simple (50 items per page).

    Raises:
        ValueError: If the channel does not exist or exposes no uploads
            playlist.
    """
    client = _ensure_youtube_client(api_key or CONFIG.youtube.api_key)
    channels = (
        client.channels().list(id=channel_id, part="contentDetails").execute()
    )
    items = channels.get("items", [])
    if not items:
        raise ValueError(f"Channel {channel_id} not found.")
    uploads_playlist = (
        items[0]
        .get("contentDetails", {})
        .get("relatedPlaylists", {})
        .get("uploads")
    )
    if not uploads_playlist:
        raise ValueError(f"Channel {channel_id} missing uploads playlist.")

    request = client.playlistItems().list(
        playlistId=uploads_playlist, part="snippet", maxResults=50
    )
    page = 0
    # list_next() returns None when there are no further pages, which ends
    # the loop naturally even before max_pages is reached.
    while request and page < max_pages:
        response = request.execute()
        yield from response.get("items", [])
        page += 1
        request = client.playlistItems().list_next(request, response)


def fetch_transcript(
    video_id: str, *, languages: Optional[Iterable[str]] = None
) -> List[TranscriptSegment]:
    """Return transcript segments for a video, or an empty list if unavailable.

    Args:
        video_id: YouTube video ID.
        languages: Preferred transcript language codes in priority order.
            Defaults to English when omitted.
    """
    api = _ensure_transcript_api()
    # BUG FIX: the underlying library expects an iterable of language codes
    # for its `languages` argument and fails when handed None. Previously a
    # missing --language flag leaked None through, the broad `except` below
    # swallowed the resulting error, and every video silently came back with
    # an empty transcript.
    language_codes = list(languages) if languages else list(_DEFAULT_LANGUAGES)
    try:
        transcripts = api.get_transcript(video_id, languages=language_codes)
    except Exception as exc:  # broad catch: missing transcripts are expected
        LOGGER.warning("Transcript unavailable for %s: %s", video_id, exc)
        return []
    return [
        TranscriptSegment(
            start=entry.get("start", 0.0),
            duration=entry.get("duration", 0.0),
            text=entry.get("text", ""),
        )
        for entry in transcripts
    ]


def collect_channel(
    channel_id: str,
    output_dir: Path,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 2,
    languages: Optional[List[str]] = None,
) -> List[VideoRecord]:
    """
    Collect metadata + transcripts for a channel and store as JSON files.

    One ``<video_id>.json`` file is written per video into ``output_dir``
    (created if missing).

    Args:
        channel_id: Target YouTube channel ID.
        output_dir: Directory for per-video JSON dumps.
        api_key: Explicit API key (defaults to config value).
        max_pages: Cap on paginated playlist fetches.
        languages: Preferred transcript languages, in priority order.

    Returns:
        The in-memory list of records, to make it easy to chain into
        ingestion.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    videos: List[VideoRecord] = []
    for item in iter_channel_videos(
        channel_id, api_key=api_key, max_pages=max_pages
    ):
        snippet = item.get("snippet", {})
        video_id = snippet.get("resourceId", {}).get("videoId")
        if not video_id:
            # Playlist items occasionally lack a resolvable video ID
            # (e.g. deleted/private videos); skip them.
            continue
        segments = fetch_transcript(video_id, languages=languages)
        record = VideoRecord(
            video_id=video_id,
            channel_id=snippet.get("channelId", channel_id),
            channel_title=snippet.get("channelTitle", ""),
            title=snippet.get("title", ""),
            description=snippet.get("description", ""),
            published_at=snippet.get("publishedAt", ""),
            url=f"https://www.youtube.com/watch?v={video_id}",
            transcript=segments,
        )
        videos.append(record)
        dest = output_dir / f"{video_id}.json"
        with dest.open("w", encoding="utf-8") as handle:
            json.dump(asdict(record), handle, ensure_ascii=False, indent=2)
        LOGGER.info("Saved %s", dest)
    return videos


def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the collector entry point."""
    parser = argparse.ArgumentParser(
        description="Collect channel transcripts into JSON files."
    )
    parser.add_argument(
        "--channel",
        required=True,
        help="YouTube channel ID (e.g. UCXYZ).",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("data/raw"),
        help="Directory to write per-video JSON files.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=2,
        help="Number of paginated channel pages to pull (50 videos per page).",
    )
    parser.add_argument(
        "--language",
        dest="languages",
        action="append",
        help="Preferred transcript languages (can be repeated).",
    )
    return parser


def main() -> None:
    """CLI entry point: parse arguments and run the collection."""
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    args = _build_parser().parse_args()
    collect_channel(
        args.channel,
        args.output,
        max_pages=args.max_pages,
        languages=args.languages,
    )


if __name__ == "__main__":  # pragma: no cover
    main()