diff --git a/search_app.py b/search_app.py
index 1116dfc..eb056f0 100644
--- a/search_app.py
+++ b/search_app.py
@@ -744,6 +744,159 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
             },
         }
 
+    def build_full_graph_payload(
+        max_nodes: int, *, highlight_id: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Attempt to render the entire reference graph by gathering every video that
+        references another (or is referenced).
+        """
+
+        query = {
+            "bool": {
+                "should": [
+                    {"range": {"internal_references_count": {"gt": 0}}},
+                    {"range": {"referenced_by_count": {"gt": 0}}},
+                    {"exists": {"field": "internal_references"}},
+                    {"exists": {"field": "referenced_by"}},
+                ],
+                "minimum_should_match": 1,
+            }
+        }
+        source_fields = [
+            "video_id",
+            "title",
+            "channel_id",
+            "channel_name",
+            "url",
+            "date",
+            "internal_references",
+            "referenced_by",
+        ]
+        nodes: Dict[str, Dict[str, Any]] = {}
+        links: List[Dict[str, Any]] = []
+        link_seen: Set[Tuple[str, str, str]] = set()
+        batch_size = min(500, max(50, max_nodes * 2))
+        truncated = False
+
+        def ensure_node(node_id: Optional[str], doc: Optional[Dict[str, Any]] = None) -> bool:
+            if not node_id:
+                return False
+            if node_id in nodes:
+                if doc:
+                    existing = nodes[node_id]
+                    if (not existing.get("title") or existing["title"] == node_id) and doc.get("title"):
+                        existing["title"] = doc["title"]
+                    if not existing.get("channel_id") and doc.get("channel_id"):
+                        existing["channel_id"] = doc["channel_id"]
+                    if (
+                        existing.get("channel_name") in {"Unknown", node_id, None}
+                        and (doc.get("channel_name") or doc.get("channel_id"))
+                    ):
+                        existing["channel_name"] = doc.get("channel_name") or doc.get("channel_id")
+                    if not existing.get("url") and doc.get("url"):
+                        existing["url"] = doc.get("url")
+                    if not existing.get("date") and doc.get("date"):
+                        existing["date"] = doc.get("date")
+                return True
+            if len(nodes) >= max_nodes:
+                return False
+            channel_name = None
+            channel_id = None
+            url = None
+            date_val = None
+            title = node_id
+            if doc:
+                title = doc.get("title") or title
+                channel_id = doc.get("channel_id")
+                channel_name = doc.get("channel_name") or channel_id
+                url = doc.get("url")
+                date_val = doc.get("date")
+            nodes[node_id] = {
+                "id": node_id,
+                "title": title,
+                "channel_id": channel_id,
+                "channel_name": channel_name or "Unknown",
+                "url": url,
+                "date": date_val,
+                "is_root": False,
+            }
+            return True
+
+        scroll_id: Optional[str] = None
+        try:
+            body = {"query": query, "_source": source_fields, "sort": ["_doc"]}
+            response = client.search(
+                index=index, body=body, size=batch_size, scroll="1m"
+            )
+            scroll_id = response.get("_scroll_id")
+            stop_fetch = False
+            while not stop_fetch:
+                hits = response.get("hits", {}).get("hits", [])
+                if not hits:
+                    break
+                for hit in hits:
+                    if len(nodes) >= max_nodes:
+                        truncated = True
+                        stop_fetch = True
+                        break
+                    source = hit.get("_source", {}) or {}
+                    video_id = source.get("video_id")
+                    if not video_id:
+                        continue
+                    if not ensure_node(video_id, source):
+                        continue
+                    for target in normalize_reference_list(source.get("internal_references")):
+                        if target == video_id:
+                            continue
+                        if not ensure_node(target):
+                            continue
+                        key = (video_id, target, "references")
+                        if key not in link_seen:
+                            links.append(
+                                {"source": video_id, "target": target, "relation": "references"}
+                            )
+                            link_seen.add(key)
+                    for origin in normalize_reference_list(source.get("referenced_by")):
+                        if origin == video_id:
+                            continue
+                        if not ensure_node(origin):
+                            continue
+                        key = (origin, video_id, "referenced_by")
+                        if key not in link_seen:
+                            links.append(
+                                {"source": origin, "target": video_id, "relation": "referenced_by"}
+                            )
+                            link_seen.add(key)
+                if stop_fetch or not scroll_id:
+                    break
+                response = client.scroll(scroll_id=scroll_id, scroll="1m")
+                scroll_id = response.get("_scroll_id")
+                if not scroll_id:
+                    break
+        finally:
+            if scroll_id:
+                try:
+                    client.clear_scroll(scroll_id=scroll_id)
+                except Exception:
+                    pass
+
+        if highlight_id and highlight_id in nodes:
+            nodes[highlight_id]["is_root"] = True
+
+        return {
+            "root": highlight_id or "",
+            "depth": 0,
+            "nodes": list(nodes.values()),
+            "links": links,
+            "meta": {
+                "node_count": len(nodes),
+                "link_count": len(links),
+                "mode": "full",
+                "truncated": truncated,
+            },
+        }
+
     @app.route("/api/channels")
     def channels():
         include_external = request.args.get("external", default="0", type=str)
@@ -832,7 +985,9 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
     @app.route("/api/graph")
     def graph_api():
         video_id = (request.args.get("video_id") or "").strip()
-        if not video_id:
+        full_graph = request.args.get("full_graph", default="0", type=str)
+        full_graph = full_graph.lower() in {"1", "true", "yes"}
+        if not full_graph and not video_id:
             return jsonify({"error": "video_id is required"}), 400
 
         try:
@@ -845,14 +1000,17 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
             max_nodes = int(request.args.get("max_nodes", "200"))
         except ValueError:
             max_nodes = 200
-        max_nodes = max(10, min(max_nodes, 400))
+        max_nodes = max(10, min(max_nodes, 800 if full_graph else 400))
 
-        payload = build_graph_payload(video_id, depth, max_nodes)
-        if not payload["nodes"]:
-            return (
-                jsonify({"error": f"Video '{video_id}' was not found in the index."}),
-                404,
-            )
+        if full_graph:
+            payload = build_full_graph_payload(max_nodes, highlight_id=video_id or None)
+        else:
+            payload = build_graph_payload(video_id, depth, max_nodes)
+            if not payload["nodes"]:
+                return (
+                    jsonify({"error": f"Video '{video_id}' was not found in the index."}),
+                    404,
+                )
         payload["meta"]["max_nodes"] = max_nodes
         return jsonify(payload)
 
diff --git a/static/graph.html b/static/graph.html
index 597b7dd..f8b03be 100644
--- a/static/graph.html
+++ b/static/graph.html
@@ -54,6 +54,17 @@
+
+
+
+
+
+          Includes every video that references another (ignores depth; may be slow). Max nodes still
+          applies.
+
+
+
+
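For a quick end-to-end check of the new mode, the sketch below calls the updated /api/graph endpoint with the full_graph flag. It is illustrative only: the base URL, port, and use of the requests library are assumptions, not part of this change; the query parameters (full_graph, video_id, max_nodes) and the response keys (nodes, links, meta with node_count, link_count, mode, truncated) come from the diff above.

    import requests

    BASE_URL = "http://localhost:5000"  # assumption: wherever the Flask app from create_app() is served

    # Request the whole reference graph. "full_graph" accepts 1/true/yes; in this
    # mode the server clamps max_nodes to 10..800 instead of 10..400.
    resp = requests.get(
        f"{BASE_URL}/api/graph",
        params={"full_graph": "1", "max_nodes": 800},
        timeout=120,
    )
    resp.raise_for_status()
    payload = resp.json()

    meta = payload["meta"]
    print(meta["node_count"], "nodes,", meta["link_count"], "links, truncated:", meta["truncated"])

    # Optionally pass video_id as well; that node comes back with is_root=True so
    # the front end can highlight it inside the full graph.

Passing an oversized max_nodes is harmless because the handler clamps it server-side, and the meta.truncated flag reports whether the scroll stopped early once the node budget was reached.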