# Source listing metadata: 227 lines, 6.5 KiB, Python
"""
Lightweight helpers for gathering video metadata and transcripts from YouTube.

Usage:
    python -m python_app.transcript_collector --channel UC123 --output data/raw

Relies on:
    - YouTube Data API v3 (requires YOUTUBE_API_KEY).
    - youtube-transcript-api for transcript retrieval.

Both libraries are optional at import time so the module can still be referenced
when only working with existing JSON dumps.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
from dataclasses import asdict, dataclass
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, Iterator, List, Optional
|
|
|
|
from .config import CONFIG
|
|
|
|
try:
|
|
from googleapiclient.discovery import build as build_youtube # type: ignore
|
|
except ImportError: # pragma: no cover - library optional
|
|
build_youtube = None
|
|
|
|
try:
|
|
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
except ImportError: # pragma: no cover - library optional
|
|
YouTubeTranscriptApi = None
|
|
|
|
|
|
LOGGER = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class TranscriptSegment:
    """A single timed caption entry taken from a video transcript."""

    # Offset of the caption from the beginning of the video
    # (presumably seconds, as delivered by the transcript library -- confirm).
    start: float
    # Length of time the caption covers (same unit as ``start``).
    duration: float
    # Caption text for this segment.
    text: str
|
|
|
|
|
|
@dataclass
class VideoRecord:
    """Metadata for one video together with its transcript segments.

    Instances are serialised with ``dataclasses.asdict`` when written to
    disk, so the field names double as the JSON keys.
    """

    # Identifier of the video itself.
    video_id: str
    # Identifier and display name of the owning channel.
    channel_id: str
    channel_title: str
    # Descriptive metadata copied from the playlist item snippet.
    title: str
    description: str
    # Publication timestamp, stored as the string delivered by the API.
    published_at: str
    # Watch-page URL for the video.
    url: str
    # Transcript segments; may be empty.
    transcript: List[TranscriptSegment]
|
|
|
|
|
|
def _ensure_youtube_client(api_key: Optional[str]):
    """Build a YouTube Data API v3 client or explain why that is impossible.

    Args:
        api_key: Developer key handed to the client builder.

    Returns:
        The object produced by ``build_youtube("youtube", "v3", ...)``.

    Raises:
        RuntimeError: If google-api-python-client could not be imported, or
            if ``api_key`` is empty or ``None``. The missing-library error
            takes precedence when both problems are present.
    """
    # Happy path first: library present and a usable key supplied.
    if build_youtube is not None and api_key:
        return build_youtube("youtube", "v3", developerKey=api_key)

    if build_youtube is None:
        raise RuntimeError(
            "google-api-python-client not installed. "
            "Install google-api-python-client to collect metadata."
        )

    # Library is available, so the only remaining problem is the key.
    raise RuntimeError("Set YOUTUBE_API_KEY to collect metadata from YouTube.")
|
|
|
|
|
|
def _ensure_transcript_api():
    """Return a fresh ``YouTubeTranscriptApi`` instance.

    Raises:
        RuntimeError: If youtube-transcript-api could not be imported when
            this module was loaded.
    """
    if YouTubeTranscriptApi is not None:
        return YouTubeTranscriptApi()

    raise RuntimeError(
        "youtube-transcript-api not installed. "
        "Install youtube-transcript-api to fetch transcripts."
    )
|
|
|
|
|
|
def iter_channel_videos(
    channel_id: str,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 10,
) -> Iterator[Dict]:
    """
    Yield raw playlist items from the uploads playlist of a channel.

    This is a generator: nothing (including client creation and the channel
    lookup) runs until the first item is requested.

    Args:
        channel_id: Target YouTube channel ID.
        api_key: Explicit API key; falls back to ``CONFIG.youtube.api_key``
            when omitted or empty.
        max_pages: Upper bound on the number of playlist pages requested
            (each request asks for up to 50 items).

    Yields:
        Playlist item dictionaries exactly as returned by the API.

    Raises:
        ValueError: If the channel lookup returns no items, or the channel
            has no uploads playlist.
        RuntimeError: Propagated from ``_ensure_youtube_client`` when the
            client library or the API key is missing.
    """
    youtube = _ensure_youtube_client(api_key or CONFIG.youtube.api_key)

    # Resolve the channel to its "uploads" playlist.
    channel_lookup = youtube.channels().list(id=channel_id, part="contentDetails")
    matches = channel_lookup.execute().get("items", [])
    if not matches:
        raise ValueError(f"Channel {channel_id} not found.")

    content_details = matches[0].get("contentDetails", {})
    related_playlists = content_details.get("relatedPlaylists", {})
    uploads_id = related_playlists.get("uploads")
    if not uploads_id:
        raise ValueError(f"Channel {channel_id} missing uploads playlist.")

    # Walk the playlist page by page until it is exhausted or the cap is hit.
    pending = youtube.playlistItems().list(
        playlistId=uploads_id, part="snippet", maxResults=50
    )
    pages_fetched = 0
    while pending and pages_fetched < max_pages:
        payload = pending.execute()
        for playlist_item in payload.get("items", []):
            yield playlist_item
        pages_fetched += 1
        # list_next hands back a falsy value once there are no more pages.
        pending = youtube.playlistItems().list_next(pending, payload)
|
|
|
|
|
|
def _segment_field(entry, name, default):
    """Read ``name`` from a transcript entry given as a mapping or an object."""
    if isinstance(entry, dict):
        return entry.get(name, default)
    return getattr(entry, name, default)


def fetch_transcript(
    video_id: str, *, languages: Optional[Iterable[str]] = None
) -> List[TranscriptSegment]:
    """Return transcript segments for a video, or ``[]`` if none is available.

    Args:
        video_id: ID of the video whose transcript should be retrieved.
        languages: Preferred language codes in priority order. When omitted,
            the argument is not forwarded at all so the transcript library
            applies its own default preference.

    Returns:
        One ``TranscriptSegment`` per transcript entry, in the order supplied
        by the library. An empty list is returned (and a warning is logged)
        when retrieval fails for any reason.

    Raises:
        RuntimeError: Propagated from ``_ensure_transcript_api`` when
            youtube-transcript-api is not installed.
    """
    api = _ensure_transcript_api()

    # Forward ``languages`` only when the caller supplied it. Passing an
    # explicit ``None`` overrides the library default and makes it iterate
    # over ``None``; the resulting TypeError was swallowed below, so every
    # default call silently produced an empty transcript.
    call_kwargs = {}
    if languages is not None:
        # Materialise so a one-shot iterable can be read more than once.
        call_kwargs["languages"] = list(languages)

    try:
        # NOTE(review): recent youtube-transcript-api releases expose an
        # instance-level ``fetch``; older ones only offer ``get_transcript``.
        # Prefer the former and fall back to the latter -- confirm against
        # the pinned library version.
        retrieve = getattr(api, "fetch", None) or api.get_transcript
        entries = list(retrieve(video_id, **call_kwargs))
    except Exception as exc:  # deliberate best-effort: one bad video must not abort a run
        LOGGER.warning("Transcript unavailable for %s: %s", video_id, exc)
        return []

    # Entries are dicts in older library versions and attribute-style
    # objects in newer ones; ``_segment_field`` reads either form.
    return [
        TranscriptSegment(
            start=_segment_field(entry, "start", 0.0),
            duration=_segment_field(entry, "duration", 0.0),
            text=_segment_field(entry, "text", ""),
        )
        for entry in entries
    ]
|
|
|
|
|
|
def collect_channel(
    channel_id: str,
    output_dir: Path,
    *,
    api_key: Optional[str] = None,
    max_pages: int = 2,
    languages: Optional[List[str]] = None,
) -> List[VideoRecord]:
    """
    Gather metadata and transcripts for a channel, writing one JSON file per video.

    Args:
        channel_id: Channel whose uploads should be collected.
        output_dir: Directory for the ``<video_id>.json`` files; created
            (including parents) if it does not exist. Existing files with
            the same name are overwritten.
        api_key: Optional API key forwarded to ``iter_channel_videos``.
        max_pages: Page cap forwarded to ``iter_channel_videos``.
        languages: Preferred transcript languages forwarded to
            ``fetch_transcript``.

    Returns:
        Every record that was written, in playlist order, so the result can
        be chained straight into ingestion. Playlist items without a video
        ID are skipped.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    collected: List[VideoRecord] = []
    playlist_items = iter_channel_videos(
        channel_id, api_key=api_key, max_pages=max_pages
    )
    for playlist_item in playlist_items:
        meta = playlist_item.get("snippet", {})
        vid = meta.get("resourceId", {}).get("videoId")
        if not vid:
            # Nothing to fetch or name the output file after.
            continue

        captions = fetch_transcript(vid, languages=languages)
        entry = VideoRecord(
            video_id=vid,
            channel_id=meta.get("channelId", channel_id),
            channel_title=meta.get("channelTitle", ""),
            title=meta.get("title", ""),
            description=meta.get("description", ""),
            published_at=meta.get("publishedAt", ""),
            url=f"https://www.youtube.com/watch?v={vid}",
            transcript=captions,
        )
        collected.append(entry)

        target = output_dir / f"{vid}.json"
        with target.open("w", encoding="utf-8") as sink:
            json.dump(asdict(entry), sink, ensure_ascii=False, indent=2)
        LOGGER.info("Saved %s", target)

    return collected
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(
|
|
description="Collect channel transcripts into JSON files."
|
|
)
|
|
parser.add_argument(
|
|
"--channel",
|
|
required=True,
|
|
help="YouTube channel ID (e.g. UCXYZ).",
|
|
)
|
|
parser.add_argument(
|
|
"--output",
|
|
type=Path,
|
|
default=Path("data/raw"),
|
|
help="Directory to write per-video JSON files.",
|
|
)
|
|
parser.add_argument(
|
|
"--max-pages",
|
|
type=int,
|
|
default=2,
|
|
help="Number of paginated channel pages to pull (50 videos per page).",
|
|
)
|
|
parser.add_argument(
|
|
"--language",
|
|
dest="languages",
|
|
action="append",
|
|
help="Preferred transcript languages (can be repeated).",
|
|
)
|
|
return parser
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse arguments and collect one channel.

    Configures root logging at INFO level, reads options from the process
    arguments, and delegates to ``collect_channel``. No API key is passed
    here, so ``collect_channel`` is called with its default for that
    parameter.
    """
    logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)

    cli_parser = _build_parser()
    options = cli_parser.parse_args()

    collect_channel(
        options.channel,
        options.output,
        max_pages=options.max_pages,
        languages=options.languages,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":  # pragma: no cover
    main()
|
|
|