Add full reference graph mode
Some checks failed
docker-build / build (push) Has been cancelled

This commit is contained in:
2025-11-19 15:23:21 -05:00
parent 7f74aaced8
commit 82c334b131
4 changed files with 301 additions and 25 deletions

View File

@@ -744,6 +744,159 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
},
}
def build_full_graph_payload(
    max_nodes: int, *, highlight_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Attempt to render the entire reference graph by gathering every video that
    references another (or is referenced).

    Documents are streamed from the search backend with the scroll API
    (``client`` / ``index`` come from the enclosing scope; presumably an
    Elasticsearch client -- confirm against ``create_app``) and folded into a
    node/link structure until ``max_nodes`` distinct nodes have been collected.

    Args:
        max_nodes: Upper bound on the number of nodes in the payload. Once the
            bound is reached no new nodes are created and fetching stops.
        highlight_id: Optional video id to flag with ``is_root=True`` if it
            ended up in the collected node set.

    Returns:
        A dict with ``root`` (``highlight_id`` or ``""``), ``depth`` (always
        0), ``nodes``, ``links`` and a ``meta`` dict holding ``node_count``,
        ``link_count``, ``mode`` (``"full"``) and ``truncated``.
        ``truncated`` is True when the node cap stopped the scan early or
        caused at least one referenced video (and its link) to be left out.
    """
    import logging

    query = {
        "bool": {
            "should": [
                {"range": {"internal_references_count": {"gt": 0}}},
                {"range": {"referenced_by_count": {"gt": 0}}},
                {"exists": {"field": "internal_references"}},
                {"exists": {"field": "referenced_by"}},
            ],
            "minimum_should_match": 1,
        }
    }
    source_fields = [
        "video_id",
        "title",
        "channel_id",
        "channel_name",
        "url",
        "date",
        "internal_references",
        "referenced_by",
    ]
    nodes: Dict[str, Dict[str, Any]] = {}
    links: List[Dict[str, Any]] = []
    link_seen: Set[Tuple[str, str, str]] = set()
    # Fetch roughly twice the node budget per page, clamped to [50, 500].
    batch_size = min(500, max(50, max_nodes * 2))
    truncated = False

    def ensure_node(node_id: Optional[str], doc: Optional[Dict[str, Any]] = None) -> bool:
        """Create or enrich the node for ``node_id``.

        Returns True when the node exists after the call, False when
        ``node_id`` is empty or the node cap prevents creating it.
        """
        if not node_id:
            return False
        if node_id in nodes:
            if doc:
                # The node may have been created earlier as a bare stub from a
                # reference list; fill in whatever the full document provides.
                existing = nodes[node_id]
                if (not existing.get("title") or existing["title"] == node_id) and doc.get("title"):
                    existing["title"] = doc["title"]
                if not existing.get("channel_id") and doc.get("channel_id"):
                    existing["channel_id"] = doc["channel_id"]
                if (
                    existing.get("channel_name") in {"Unknown", node_id, None}
                    and (doc.get("channel_name") or doc.get("channel_id"))
                ):
                    existing["channel_name"] = doc.get("channel_name") or doc.get("channel_id")
                if not existing.get("url") and doc.get("url"):
                    existing["url"] = doc.get("url")
                if not existing.get("date") and doc.get("date"):
                    existing["date"] = doc.get("date")
            return True
        if len(nodes) >= max_nodes:
            return False
        channel_name = None
        channel_id = None
        url = None
        date_val = None
        title = node_id
        if doc:
            title = doc.get("title") or title
            channel_id = doc.get("channel_id")
            channel_name = doc.get("channel_name") or channel_id
            url = doc.get("url")
            date_val = doc.get("date")
        nodes[node_id] = {
            "id": node_id,
            "title": title,
            "channel_id": channel_id,
            "channel_name": channel_name or "Unknown",
            "url": url,
            "date": date_val,
            "is_root": False,
        }
        return True

    def add_link(origin_id: str, target_id: str, relation: str) -> None:
        """Append a link once per (origin, target, relation) triple."""
        key = (origin_id, target_id, relation)
        if key in link_seen:
            return
        links.append({"source": origin_id, "target": target_id, "relation": relation})
        link_seen.add(key)

    scroll_id: Optional[str] = None
    try:
        body = {"query": query, "_source": source_fields, "sort": ["_doc"]}
        response = client.search(
            index=index, body=body, size=batch_size, scroll="1m"
        )
        scroll_id = response.get("_scroll_id")
        stop_fetch = False
        while not stop_fetch:
            hits = response.get("hits", {}).get("hits", [])
            if not hits:
                break
            for hit in hits:
                if len(nodes) >= max_nodes:
                    truncated = True
                    stop_fetch = True
                    break
                source = hit.get("_source", {}) or {}
                video_id = source.get("video_id")
                if not video_id:
                    continue
                if not ensure_node(video_id, source):
                    continue
                for target in normalize_reference_list(source.get("internal_references")):
                    if target == video_id:
                        continue
                    if not ensure_node(target):
                        # A non-empty id can only be rejected by the node cap,
                        # so the payload is missing part of the graph.
                        if target:
                            truncated = True
                        continue
                    add_link(video_id, target, "references")
                for origin in normalize_reference_list(source.get("referenced_by")):
                    if origin == video_id:
                        continue
                    if not ensure_node(origin):
                        if origin:
                            truncated = True
                        continue
                    add_link(origin, video_id, "referenced_by")
            if stop_fetch or not scroll_id:
                break
            response = client.scroll(scroll_id=scroll_id, scroll="1m")
            next_scroll_id = response.get("_scroll_id")
            if not next_scroll_id:
                # Keep the last known id so the finally-block can still release it.
                break
            scroll_id = next_scroll_id
    finally:
        if scroll_id:
            try:
                client.clear_scroll(scroll_id=scroll_id)
            except Exception:
                # Best-effort cleanup: the scroll context expires on its own,
                # so a failure here must not break the response.
                logging.getLogger(__name__).debug(
                    "Failed to clear scroll context", exc_info=True
                )
    if highlight_id and highlight_id in nodes:
        nodes[highlight_id]["is_root"] = True
    return {
        "root": highlight_id or "",
        "depth": 0,
        "nodes": list(nodes.values()),
        "links": links,
        "meta": {
            "node_count": len(nodes),
            "link_count": len(links),
            "mode": "full",
            "truncated": truncated,
        },
    }
@app.route("/api/channels")
def channels():
include_external = request.args.get("external", default="0", type=str)
@@ -832,7 +985,9 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
@app.route("/api/graph")
def graph_api():
video_id = (request.args.get("video_id") or "").strip()
if not video_id:
full_graph = request.args.get("full_graph", default="0", type=str)
full_graph = full_graph.lower() in {"1", "true", "yes"}
if not full_graph and not video_id:
return jsonify({"error": "video_id is required"}), 400
try:
@@ -845,14 +1000,17 @@ def create_app(config: AppConfig = CONFIG) -> Flask:
max_nodes = int(request.args.get("max_nodes", "200"))
except ValueError:
max_nodes = 200
max_nodes = max(10, min(max_nodes, 400))
max_nodes = max(10, min(max_nodes, 800 if full_graph else 400))
payload = build_graph_payload(video_id, depth, max_nodes)
if not payload["nodes"]:
return (
jsonify({"error": f"Video '{video_id}' was not found in the index."}),
404,
)
if full_graph:
payload = build_full_graph_payload(max_nodes, highlight_id=video_id or None)
else:
payload = build_graph_payload(video_id, depth, max_nodes)
if not payload["nodes"]:
return (
jsonify({"error": f"Video '{video_id}' was not found in the index."}),
404,
)
payload["meta"]["max_nodes"] = max_nodes
return jsonify(payload)