"""Comparison test: TextRank vs TF-IDF extractive summarization."""
import re
import math
from collections import Counter
# ── shared utilities ──
# Runs of music-note glyphs, optionally wrapped in square brackets — common
# noise in auto-generated captions (e.g. "[♪♪]").
_MUSIC_RE = re.compile(r"\[?[♪♫♬]+\]?")
# English and Korean function words ignored by the similarity and dedup
# computations; frozenset for O(1) membership tests.
_STOPWORDS = frozenset(
    "the a an is are was were be been being have has had do does did will would "
    "shall should may might can could of in to for on with at by from as into "
    "through during before after above below between and but or nor not so yet "
    "this that these those it its he she they we you i me him her us them my "
    "his our your their what which who whom how when where why all each every "
    "both few more most other some such no any if than too very just about also "
    "then only still even because since while although though after before until "
    "은 는 이 가 을 를 에 에서 의 와 과 도 로 으로 한 된 하는 있는 없는 그 이 저 것 수 등 "
    "좀 잘 더 또 안 못 제 너 나 요 네 거 건 게 데 때 곳 중 다 해 줘 줄 걸 뭐 왜".split()
)
# Cue phrases (Korean + English) that mark a sentence as likely important;
# matched case-insensitively.
# NOTE(review): Hangul counts as \w under Python 3's default Unicode matching,
# so the trailing \b prevents matches when a particle is attached (e.g.
# "결론적으로" will not match "결론") — confirm whether that is intended.
_IMPORTANCE_RE = re.compile(
    r"\b(결론|핵심|요약하면|정리하면|요점|중요한|"
    r"in summary|to summarize|the key point|importantly|in conclusion|"
    r"takeaway|bottom line|to recap|the main|crucial|essential)\b",
    re.IGNORECASE,
)
def clean(text):
    """Remove music-note caption noise, then collapse whitespace runs."""
    without_music = _MUSIC_RE.sub("", text)
    collapsed = re.sub(r"\s{2,}", " ", without_music)
    return collapsed.strip()
def split_sentences(text):
    """Split on sentence-final punctuation + whitespace, or on newlines.

    Fragments of 15 characters or fewer (after stripping) are discarded.
    """
    fragments = re.split(r"(?<=[.!?。])\s+|\n+", text)
    kept = []
    for fragment in fragments:
        if not fragment:
            continue
        trimmed = fragment.strip()
        if len(trimmed) > 15:
            kept.append(trimmed)
    return kept
def tokenize(text):
    """Return lowercased alphanumeric/Hangul tokens at least 2 chars long."""
    words = re.findall(r"[a-zA-Z가-힣\d]+", text)
    return [word.lower() for word in words if len(word) > 1]
def adaptive_max_chars(text_len):
    """Target summary length: a shrinking fraction of the input length.

    Longer inputs get proportionally smaller summaries; never below 200 chars.
    """
    tiers = ((1000, 0.50), (5000, 0.20), (20000, 0.10))
    for upper_bound, ratio in tiers:
        if text_len < upper_bound:
            return max(200, int(text_len * ratio))
    return max(200, int(text_len * 0.05))
# ── TextRank implementation ──
def _cosine_similarity(a_tokens, b_tokens):
    """Cosine similarity between two token lists, with stopwords removed.

    Args:
        a_tokens: token list for the first sentence (output of ``tokenize``).
        b_tokens: token list for the second sentence.

    Returns:
        float in [0, 1]; 0.0 when either side is empty after stopword removal.
    """
    a_counts = Counter(t for t in a_tokens if t not in _STOPWORDS)
    b_counts = Counter(t for t in b_tokens if t not in _STOPWORDS)
    if not a_counts or not b_counts:
        return 0.0
    # Only words present in BOTH vectors contribute to the dot product, so
    # iterate the key-view intersection instead of the full union of words
    # with per-word .get() fallbacks (the original did the latter).
    dot = sum(a_counts[w] * b_counts[w] for w in a_counts.keys() & b_counts.keys())
    mag_a = math.sqrt(sum(v * v for v in a_counts.values()))
    mag_b = math.sqrt(sum(v * v for v in b_counts.values()))
    # Both Counters are non-empty with counts >= 1, so the magnitudes are
    # strictly positive; the original zero-magnitude guard was unreachable.
    return dot / (mag_a * mag_b)
def textrank_summary(text, max_sentences=7, max_chars=0):
    """TextRank extractive summary.

    Builds a sentence-similarity graph, scores sentences with PageRank,
    applies position/keyword bonuses, then selects top sentences (skipping
    near-duplicates) and emits them in original document order.

    Args:
        text: raw transcript/body text.
        max_sentences: upper bound on the number of sentences selected.
        max_chars: hard length cap; <= 0 derives one via adaptive_max_chars.

    Returns:
        Summary string ("" for empty input), at most max_chars long.
    """
    text = clean(text)
    if not text:
        return ""
    if max_chars <= 0:
        max_chars = adaptive_max_chars(len(text))
    sentences = [s for s in split_sentences(text) if len(s.strip()) > 20]
    # Too few sentences to rank meaningfully: return them (truncated) as-is.
    if len(sentences) <= max_sentences:
        return ". ".join(sentences)[:max_chars]
    n = len(sentences)
    # 1. Tokenize each sentence once.
    sent_tokens = [tokenize(s) for s in sentences]
    # 2. Symmetric similarity matrix; the diagonal stays 0.0.
    similarity_matrix = [[0.0] * n for _ in range(n)]
    for i in range(n):
        for j in range(i + 1, n):
            sim = _cosine_similarity(sent_tokens[i], sent_tokens[j])
            similarity_matrix[i][j] = sim
            similarity_matrix[j][i] = sim
    # 3. PageRank (damping=0.85, 30 sweeps). Node j's outgoing weight sum is
    # constant across all sweeps and all i, so compute it once here. The
    # original recomputed it inside the (i, j) loop, making each sweep O(n^3)
    # instead of O(n^2). Since the diagonal is 0.0, sum(row) equals the sum
    # over k != j — the scores produced are identical.
    out_sums = [sum(row) for row in similarity_matrix]
    damping = 0.85
    scores = [1.0 / n] * n
    for _ in range(30):
        new_scores = [0.0] * n
        for i in range(n):
            rank_sum = 0.0
            for j in range(n):
                if j != i and out_sums[j] > 0:
                    rank_sum += similarity_matrix[j][i] * scores[j] / out_sums[j]
            new_scores[i] = (1 - damping) / n + damping * rank_sum
        scores = new_scores
    # 4. Heuristic bonuses: lead sentence, final sentence, importance cues.
    for i in range(n):
        if i == 0:
            scores[i] *= 1.3
        elif i == n - 1:
            scores[i] *= 1.2
        if _IMPORTANCE_RE.search(sentences[i]):
            scores[i] *= 1.5
    # 5. Greedy top-score selection, skipping near-duplicates (Jaccard > 0.5).
    # Stopword-filtered token sets are precomputed once instead of being
    # rebuilt for every pairwise comparison as the original did.
    content_sets = [set(toks) - _STOPWORDS for toks in sent_tokens]
    ranked = sorted(range(n), key=lambda i: scores[i], reverse=True)
    selected_indices = []
    for idx in ranked:
        if len(selected_indices) >= max_sentences:
            break
        ta = content_sets[idx]
        is_dup = False
        for sel_idx in selected_indices:
            tb = content_sets[sel_idx]
            if ta and tb and len(ta & tb) / len(ta | tb) > 0.5:
                is_dup = True
                break
        if not is_dup:
            selected_indices.append(idx)
    # 6. Restore document order.
    selected_indices.sort()
    # 7. Pack sentences under max_chars; always keep at least one sentence.
    parts = []
    total = 0
    for idx in selected_indices:
        s = sentences[idx]
        addition = len(s) + (2 if parts else 0)  # 2 == len(". ") separator
        if total + addition > max_chars and parts:
            break
        parts.append(s)
        total += addition
    result = ". ".join(parts)
    if result and not result.endswith((".", "!", "?")):
        result += "."
    return result
# ── existing TF-IDF baseline (current production code, unchanged) ──
def tfidf_summary(text, max_sentences=7, max_chars=0):
    """Baseline: delegate to the project's current TF-IDF extractive summary."""
    # Imported lazily so this module loads even without the package installed.
    from mcp_youtube_intelligence.core.summarizer import extractive_summary as _baseline
    return _baseline(text, max_sentences=max_sentences, max_chars=max_chars)
# ── run the comparison ──
if __name__ == "__main__":
    from mcp_youtube_intelligence.core.transcript import fetch_transcript

    # (video_id, human-readable description) pairs mixing languages and lengths.
    videos = [
        ("kCc8FmEb1nY", "Let's build GPT (Karpathy, 영어 긴 영상)"),
        ("pBy1zgt0XPc", "한국어 짧은 영상"),
        ("aircAruvnKk", "한국어 중간 영상"),
    ]
    for vid, desc in videos:
        print(f"\n{'='*80}")
        print(f"📹 {desc} ({vid})")
        print(f"{'='*80}")
        # NOTE(review): assumes fetch_transcript returns a dict with at least
        # 'best' (transcript text, possibly falsy) and 'lang' keys — confirm.
        result = fetch_transcript(vid)
        raw = result['best']
        if not raw:
            print(" ❌ 자막 없음, 스킵")
            continue
        print(f" 📝 원본: {len(raw):,} chars | 언어: {result['lang']}")
        print(f" 📏 목표 요약 길이: {adaptive_max_chars(len(raw))} chars")
        # TF-IDF summary (current baseline)
        tfidf = tfidf_summary(raw)
        print(f"\n ── TF-IDF (현재) ──")
        print(f" 길이: {len(tfidf)} chars ({len(tfidf)*100//len(raw)}% of original)")
        print(f" 내용: {tfidf[:400]}")
        # TextRank summary (candidate replacement)
        tr = textrank_summary(raw)
        print(f"\n ── TextRank (신규) ──")
        print(f" 길이: {len(tr)} chars ({len(tr)*100//len(raw)}% of original)")
        print(f" 내용: {tr[:400]}")
        # Comparison: overlap of the ". "-joined sentences of both summaries.
        print(f"\n ── 비교 ──")
        # Sentence-level overlap (approximate: relies on ". " as the joiner).
        tfidf_sents = set(tfidf.split(". "))
        tr_sents = set(tr.split(". "))
        overlap = tfidf_sents & tr_sents
        print(f" 겹치는 문장: {len(overlap)}/{max(len(tfidf_sents), len(tr_sents))}")
        print(f" TF-IDF 고유 문장 수: {len(tfidf_sents - tr_sents)}")
        print(f" TextRank 고유 문장 수: {len(tr_sents - tfidf_sents)}")