# TLC-Search/transcript_collector.py
"""
Lightweight helpers for gathering video metadata and transcripts from YouTube.
Usage:
python -m python_app.transcript_collector --channel UC123 --output data/raw
Relies on:
- YouTube Data API v3 (requires YOUTUBE_API_KEY).
- youtube-transcript-api for transcript retrieval.
Both libraries are optional at import time so the module can still be referenced
when only working with existing JSON dumps.
"""
from __future__ import annotations

import argparse
import json
import logging
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional

from .config import CONFIG

try:
    from googleapiclient.discovery import build as build_youtube  # type: ignore
except ImportError:  # pragma: no cover - library optional
    build_youtube = None

try:
    from youtube_transcript_api import YouTubeTranscriptApi  # type: ignore
except ImportError:  # pragma: no cover - library optional
    YouTubeTranscriptApi = None

LOGGER = logging.getLogger(__name__)
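
# Working offline from existing dumps needs neither optional dependency; a
# minimal sketch, assuming the files were written by collect_channel() below:
#
#     import json
#     from pathlib import Path
#
#     records = [
#         json.loads(path.read_text(encoding="utf-8"))
#         for path in Path("data/raw").glob("*.json")
#     ]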


@dataclass
class TranscriptSegment:
    """One timed caption line from a video's transcript."""

    start: float
    duration: float
    text: str


@dataclass
class VideoRecord:
    """Metadata plus transcript for a single video; serialized to JSON on disk."""

    video_id: str
    channel_id: str
    channel_title: str
    title: str
    description: str
    published_at: str
    url: str
    transcript: List[TranscriptSegment]
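
# For reference, collect_channel() serializes VideoRecord via asdict(), so each
# per-video JSON file mirrors the dataclass fields; a sketch with illustrative
# (not real) values:
#
# {
#   "video_id": "abc123",
#   "channel_id": "UC123",
#   "channel_title": "Example Channel",
#   "title": "Example Title",
#   "description": "...",
#   "published_at": "2025-01-01T00:00:00Z",
#   "url": "https://www.youtube.com/watch?v=abc123",
#   "transcript": [{"start": 0.0, "duration": 2.5, "text": "hello"}]
# }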


def _ensure_youtube_client(api_key: Optional[str]):
    if build_youtube is None:
        raise RuntimeError(
            "google-api-python-client is not installed. "
            "Install it to collect metadata."
        )
    if not api_key:
        raise RuntimeError(
            "Set YOUTUBE_API_KEY to collect metadata from YouTube."
        )
    return build_youtube("youtube", "v3", developerKey=api_key)


def _ensure_transcript_api():
    if YouTubeTranscriptApi is None:
        raise RuntimeError(
            "youtube-transcript-api is not installed. "
            "Install it to fetch transcripts."
        )
    return YouTubeTranscriptApi()


def iter_channel_videos(
    channel_id: str,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 10,
) -> Iterator[Dict]:
    """
    Yield raw playlist items from the uploads playlist of the given channel.

    Args:
        channel_id: Target YouTube channel ID.
        api_key: Explicit API key (defaults to the config value).
        max_pages: Hard cap on paginated playlist fetches to keep things simple.
    """
    client = _ensure_youtube_client(api_key or CONFIG.youtube.api_key)
    # Every channel exposes its uploads as a hidden playlist; look its ID up
    # via channels().list, which is far cheaper on quota than the search
    # endpoint.
    channels = (
        client.channels().list(id=channel_id, part="contentDetails").execute()
    )
    items = channels.get("items", [])
    if not items:
        raise ValueError(f"Channel {channel_id} not found.")
    uploads_playlist = (
        items[0]
        .get("contentDetails", {})
        .get("relatedPlaylists", {})
        .get("uploads")
    )
    if not uploads_playlist:
        raise ValueError(f"Channel {channel_id} missing uploads playlist.")
    request = client.playlistItems().list(
        playlistId=uploads_playlist, part="snippet", maxResults=50
    )
    page = 0
    while request and page < max_pages:
        response = request.execute()
        for item in response.get("items", []):
            yield item
        page += 1
        # list_next() returns None once the last page has been consumed.
        request = client.playlistItems().list_next(request, response)
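
# Minimal usage sketch (the channel ID is illustrative; assumes a valid
# YOUTUBE_API_KEY is configured):
#
#     for item in iter_channel_videos("UCXYZ", max_pages=1):
#         print(item["snippet"]["resourceId"]["videoId"])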


def fetch_transcript(
    video_id: str, *, languages: Optional[Iterable[str]] = None
) -> List[TranscriptSegment]:
    """Return transcript segments for a video, or an empty list if unavailable."""
    api = _ensure_transcript_api()
    # get_transcript() cannot take languages=None, so default to English here.
    # (Newer youtube-transcript-api releases also offer fetch(); the
    # long-standing get_transcript() interface is kept in this draft.)
    language_prefs = list(languages) if languages is not None else ["en"]
    try:
        transcripts = api.get_transcript(video_id, languages=language_prefs)
    except Exception as exc:  # broad catch keeps the draft simple
        LOGGER.warning("Transcript unavailable for %s: %s", video_id, exc)
        return []
    return [
        TranscriptSegment(
            start=entry.get("start", 0.0),
            duration=entry.get("duration", 0.0),
            text=entry.get("text", ""),
        )
        for entry in transcripts
    ]
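
# Usage sketch (illustrative video ID; assumes captions exist for it):
#
#     segments = fetch_transcript("dQw4w9WgXcQ", languages=["en", "de"])
#     full_text = " ".join(segment.text for segment in segments)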


def collect_channel(
    channel_id: str,
    output_dir: Path,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 2,
    languages: Optional[List[str]] = None,
) -> List[VideoRecord]:
    """
    Collect metadata + transcripts for a channel and store them as JSON files.

    Returns the in-memory list to make it easy to chain into ingestion.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    videos: List[VideoRecord] = []
    for item in iter_channel_videos(
        channel_id, api_key=api_key, max_pages=max_pages
    ):
        snippet = item.get("snippet", {})
        video_id = snippet.get("resourceId", {}).get("videoId")
        if not video_id:
            continue
        segments = fetch_transcript(video_id, languages=languages)
        record = VideoRecord(
            video_id=video_id,
            channel_id=snippet.get("channelId", channel_id),
            channel_title=snippet.get("channelTitle", ""),
            title=snippet.get("title", ""),
            description=snippet.get("description", ""),
            published_at=snippet.get("publishedAt", ""),
            url=f"https://www.youtube.com/watch?v={video_id}",
            transcript=segments,
        )
        videos.append(record)
        dest = output_dir / f"{video_id}.json"
        with dest.open("w", encoding="utf-8") as handle:
            # asdict() recursively converts the nested TranscriptSegment list.
            json.dump(asdict(record), handle, ensure_ascii=False, indent=2)
        LOGGER.info("Saved %s", dest)
    return videos
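
# Chaining sketch (the downstream ingest step is hypothetical; adapt it to
# your pipeline):
#
#     records = collect_channel("UCXYZ", Path("data/raw"), max_pages=1)
#     LOGGER.info("Collected %d videos", len(records))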


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Collect channel transcripts into JSON files."
    )
    parser.add_argument(
        "--channel",
        required=True,
        help="YouTube channel ID (e.g. UCXYZ).",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("data/raw"),
        help="Directory to write per-video JSON files.",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=2,
        help="Number of paginated channel pages to pull (50 videos per page).",
    )
    parser.add_argument(
        "--language",
        dest="languages",
        action="append",
        help="Preferred transcript languages (can be repeated).",
    )
    return parser


def main() -> None:
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    args = _build_parser().parse_args()
    collect_channel(
        args.channel,
        args.output,
        max_pages=args.max_pages,
        languages=args.languages,
    )


if __name__ == "__main__":  # pragma: no cover
    main()