Skip to main content
Glama
xiaohongshu_processor.py (16.8 kB)
import json
import logging
import re
import sys
import time
from typing import Any, Dict, List, Optional, Tuple

import requests

# Desktop User-Agent used for fetching Xiaohongshu pages (desktop first, which
# avoids the forced "open in App" redirect).
HEADERS_XHS_PC = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://www.xiaohongshu.com/",
}

# Fallback: mobile User-Agent (tried only when the desktop request fails).
HEADERS_XHS_MOBILE = {
    "User-Agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://www.xiaohongshu.com/",
}

logger = logging.getLogger(__name__)


class XiaohongshuProcessor:
    """Parser for Xiaohongshu share links (video notes and image notes).

    Features:
    - Parse the HTML of a share link and collect candidate direct video links.
    - Pick the "watermark-free" variant using heuristic rules.
    - Parse image notes from the page's ``window.__INITIAL_STATE__`` payload.
    """

    def __init__(self, timeout: int = 12):
        """Store the per-request timeout (seconds) passed to ``requests``."""
        self.timeout = timeout

    @staticmethod
    def _extract_first_url(text: str) -> str:
        """Return the first http(s) URL found in *text*.

        Raises:
            ValueError: if *text* contains no URL.
        """
        match = re.search(
            r"http[s]?://(?:[a-zA-Z0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F]{2}))+",
            text,
        )
        if match is None:
            raise ValueError("未找到有效的小红书链接")
        return match.group(0)

    @staticmethod
    def _extract_note_id_from_path(url: str) -> Optional[str]:
        """Return the note ID embedded in a note URL, or ``None``.

        Recognises both ``/explore/{note_id}`` and ``/discovery/item/{note_id}``
        (case-insensitively) so the video and image parsers agree on what a
        note URL looks like.
        """
        m = re.search(r"/(?:explore|discovery/item)/([a-z0-9]+)", url, re.IGNORECASE)
        return m.group(1) if m else None

    @staticmethod
    def _sanitize_title(title: str) -> str:
        """Replace the characters ``\\ / : * ? " < > |`` with ``_`` and strip whitespace."""
        return re.sub(r"[\\/:*?\"<>|]", "_", title).strip()

    @staticmethod
    def _extract_meta(content: str, name_or_property: str, key: str = "content") -> Optional[str]:
        """Return attribute *key* of the first matching ``<meta>`` tag, or ``None``.

        Matches both ``name="og:video"`` and ``property="og:video"``. The
        ``name``/``property`` attribute must appear before *key* inside the tag.
        """
        # The value must be closed by the same quote character that opened it
        # (backreference \1); otherwise content="It's ..." would be cut at the
        # apostrophe.
        pattern = (
            rf"<meta[^>]+(?:name|property)=[\"']{re.escape(name_or_property)}[\"']"
            rf"[^>]+{key}=([\"'])(.*?)\1"
        )
        m = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        return m.group(2) if m else None

    @staticmethod
    def _extract_all_video_src(content: str) -> List[str]:
        """Return every distinct ``<video ... src="...">`` URL, in document order."""
        # dict.fromkeys de-duplicates while keeping first-seen order, so that
        # candidate ranking is deterministic between runs (a set is not).
        return list(
            dict.fromkeys(
                m.group(1)
                for m in re.finditer(r"<video[^>]+src=\"(.*?)\"", content, re.IGNORECASE)
            )
        )

    @staticmethod
    def _score_candidate(url: str, source: str) -> int:
        """Score a candidate direct link; a higher score means higher priority.

        Heuristics (based on observed samples and common page behaviour):
        - Prefer links taken from the ``<video>`` DOM element (closest to the
          stream actually played).
        - Prefer URLs ending in ``_114.mp4`` or containing a ``/114/`` path
          segment (that variant was watermark-free in the samples).
        - Penalise URLs that explicitly mention ``wm`` / ``watermark``.
        """
        score = 0
        if source == "video":
            score += 100
        if re.search(r"/(114)/", url) or re.search(r"_114\.mp4($|\?)", url):
            score += 50
        # Empirical negative signal (extend as needed).
        if re.search(r"(?:wm|watermark)", url, re.IGNORECASE):
            score -= 40
        return score

    @staticmethod
    def _extract_quality_code(url: str) -> Optional[int]:
        """Return the quality code found in *url*, or ``None``.

        The code is the first 2-4 digit path segment (``/.../<q>/...``) or,
        failing that, the ``_<q>.mp4`` suffix of the file name.
        """
        # Both patterns capture digits only, so int() cannot fail here.
        m = re.search(r"/([0-9]{2,4})/", url) or re.search(r"_([0-9]{2,4})\.mp4(?:$|\?)", url)
        return int(m.group(1)) if m else None

    @staticmethod
    def _force_quality_url(url: str, target: str = "114") -> str:
        """Rewrite the quality code in both the path segment and the file name to *target*."""
        rewritten = re.sub(r"/([0-9]{2,4})/", f"/{target}/", url)
        return re.sub(r"_([0-9]{2,4})\.mp4", f"_{target}.mp4", rewritten)

    @staticmethod
    def _prefer_hs_domain(url: str) -> str:
        """Rewrite any ``https://sns-video-*.xhscdn.com`` host to the ``hs`` node."""
        return re.sub(r"https://sns-video-[a-z]+\.xhscdn\.com", "https://sns-video-hs.xhscdn.com", url)

    def _probe_head_ok(self, url: str) -> bool:
        """Return ``True`` if a HEAD request to *url* answers with a 2xx/3xx status.

        This is a best-effort probe: any failure is reported as ``False``
        (and logged at debug level) instead of being raised.
        """
        try:
            resp = requests.head(
                url,
                headers=HEADERS_XHS_PC,
                timeout=min(self.timeout, 6),  # keep the probe short
                allow_redirects=True,
            )
        except Exception as exc:  # deliberate best-effort: never propagate
            logger.debug("HEAD probe failed for %s: %s", url, exc)
            return False
        return 200 <= resp.status_code < 400

    def get_watermark_free_url(self, candidates: List[Tuple[str, str]]) -> str:
        """Pick the most likely watermark-free URL from ``(url, source)`` candidates.

        Strategy:
        1. Normalise the CDN host and extract each candidate's quality code.
        2. If a candidate already has quality code 114, return it.
        3. Otherwise take the best-scoring candidate, rewrite it to 114 and
           return the rewritten URL if a HEAD probe succeeds.
        4. Fall back to the best-scoring candidate unchanged.

        Raises:
            ValueError: if *candidates* is empty.
        """
        if not candidates:
            raise ValueError("未从页面中发现可用视频直链")

        # 1) Normalise the host first, then extract the quality code.
        normalized: List[Tuple[str, str, Optional[int]]] = []
        for url, source in candidates:
            u = self._prefer_hs_domain(url)
            normalized.append((u, source, self._extract_quality_code(u)))

        # 2) A 114 variant exists: return it directly.
        for u, _source, q in normalized:
            if q == 114:
                logger.debug("[小红书视频] 找到114质量码视频(无水印)")
                return u

        # 3) Otherwise take the most trustworthy candidate (highest score, then
        #    shortest URL; min() keeps the first one on ties), force it to 114
        #    and verify with a HEAD request.
        best = min(
            normalized,
            key=lambda item: (-self._score_candidate(item[0], item[1]), len(item[0])),
        )
        logger.debug(f"[小红书视频] 未找到114质量码,尝试转换URL(质量码: {best[2]})")

        forced = self._force_quality_url(best[0], "114")
        if forced != best[0] and self._probe_head_ok(forced):
            logger.debug("[小红书视频] URL转换成功,使用114版本")
            return forced

        # 4) Fall back to the original best candidate.
        logger.debug(f"[小红书视频] URL转换失败,使用原始候选(质量码: {best[2]})")
        return best[0]

    def _fetch_html(self, url: str) -> str:
        """Fetch *url* and return the response body as text.

        The desktop User-Agent is tried first; the mobile one is used only when
        the first response is a 5xx or has an empty body.

        Raises:
            requests.HTTPError: if the final response has an error status.
        """
        resp = requests.get(url, headers=HEADERS_XHS_PC, timeout=self.timeout, allow_redirects=True)
        # Per the original author's note, some risk-control responses are 404
        # pages that still carry SSR content, so the User-Agent is switched
        # only on a complete failure.
        if resp.status_code >= 500 or not resp.text:
            resp = requests.get(url, headers=HEADERS_XHS_MOBILE, timeout=self.timeout, allow_redirects=True)
        resp.raise_for_status()
        return resp.text

    def _extract_initial_state(self, html: str) -> Optional[dict]:
        """Extract and decode the ``window.__INITIAL_STATE__`` object from *html*.

        Returns ``None`` when the script tag is missing or its payload is not
        valid JSON after ``undefined`` values have been replaced with ``null``.
        """
        pattern = r'<script>\s*window\.__INITIAL_STATE__\s*=\s*(\{.*?\})\s*(?:</script>|$)'
        match = re.search(pattern, html, re.DOTALL)
        if not match:
            return None

        # JavaScript `undefined` is not valid JSON; map it to null.
        json_str = re.sub(r':\s*undefined\s*([,}])', r': null\1', match.group(1))
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None

    @staticmethod
    def _ensure_https(url: str) -> str:
        """Return *url* with a leading ``http://`` upgraded to ``https://``.

        Per the original author's note, the Xiaohongshu CDN serves both
        schemes; HTTPS is used throughout to avoid mixed-content problems.
        Falsy or non-``http://`` values are returned unchanged.
        """
        if url and url.startswith('http://'):
            return url.replace('http://', 'https://', 1)
        return url

    @staticmethod
    def _convert_image_url_to_png(webp_url: str) -> Optional[str]:
        """Convert a WebP CDN link into a PNG image-service link.

        The image ID (the part of the last path component before ``!``) is
        extracted from a ``sns-webpic-qc.xhscdn.com`` URL and re-targeted at
        ``ci.xiaohongshu.com``. The approach follows the XHS-Downloader
        userscript.

        Example:
            in:  http://sns-webpic-qc.xhscdn.com/202510042121/15b1bc2cb.../1040g2sg31bs6p8sb0kdg5o3q72pg8rvklgbf230!nd_dft_wlteh_webp_3
            out: https://ci.xiaohongshu.com/1040g2sg31bs6p8sb0kdg5o3q72pg8rvklgbf230?imageView2/format/png

        Returns ``None`` when *webp_url* does not match the expected shape.
        """
        # Image ID = everything before the exclamation mark; http and https both accepted.
        pattern = r'https?://sns-webpic-qc\.xhscdn\.com/\d+/[0-9a-z]+/(\S+?)!'
        match = re.search(pattern, webp_url)
        if not match:
            return None
        # Always emit an HTTPS link.
        return f'https://ci.xiaohongshu.com/{match.group(1)}?imageView2/format/png'

    @staticmethod
    def _pick_webp_url(img: Dict[str, Any]) -> Optional[str]:
        """Return the WebP URL of one ``imageList`` entry, or ``None``.

        The first ``infoList`` item whose ``imageScene`` is ``WB_DFT`` wins;
        ``urlDefault`` is the fallback. A missing or null ``infoList`` is
        treated as empty.
        """
        webp_url = None
        for info in img.get('infoList') or []:
            if info.get('imageScene') == 'WB_DFT':
                webp_url = info.get('url')
                break
        return webp_url or img.get('urlDefault')

    def parse_image_note(self, share_text: str) -> Dict[str, Any]:
        """Parse an image note and return its images and metadata.

        Returned structure::

            {
                "note_id": str,
                "title": str,
                "desc": str,
                "type": "image",
                "images": [
                    {
                        "url_webp": str,  # WebP link (HTTPS)
                        "url_png": str,   # PNG link, or the WebP link if conversion failed
                        "width": int,
                        "height": int,
                    },
                    ...
                ],
            }

        Raises:
            ValueError: if no URL, note ID, page data or images can be found,
                or if the page data does not have the expected structure.
        """
        start_time = time.perf_counter()

        share_url = self._extract_first_url(share_text)
        logger.debug(f"[小红书图文] 提取到的链接: {share_url}")

        # Request first so that short links are resolved to the real URL.
        t1 = time.perf_counter()
        resp = requests.get(share_url, headers=HEADERS_XHS_PC, timeout=self.timeout, allow_redirects=True)
        logger.debug(f"[小红书图文] 页面请求耗时: {time.perf_counter()-t1:.2f}秒")

        final_url = resp.url
        html = resp.text

        # The note ID comes from the post-redirect URL.
        note_id = self._extract_note_id_from_path(final_url)
        if not note_id:
            raise ValueError("无法从链接中提取笔记 ID")
        logger.debug(f"[小红书图文] Note ID: {note_id}")

        t2 = time.perf_counter()
        data = self._extract_initial_state(html)
        if not data:
            raise ValueError("无法从页面中提取笔记数据")
        logger.debug(f"[小红书图文] JSON解析耗时: {time.perf_counter()-t2:.2f}秒")

        try:
            # Navigate to the note detail.
            note_map = data['note']['noteDetailMap']
            if note_id not in note_map:
                raise KeyError(f"笔记 {note_id} 不在数据中")
            note_info = note_map[note_id]['note']

            image_list = note_info.get('imageList', [])
            if not image_list:
                raise ValueError("笔记中没有找到图片")

            # Build the image list, offering both WebP and PNG links.
            t3 = time.perf_counter()
            images = []
            for img in image_list:
                webp_url = self._pick_webp_url(img)
                if not webp_url:
                    continue
                webp_url = self._ensure_https(webp_url)
                png_url = self._convert_image_url_to_png(webp_url)
                images.append({
                    'url_webp': webp_url,
                    'url_png': png_url if png_url else webp_url,
                    'width': img.get('width'),
                    'height': img.get('height'),
                })

            # A missing, null or empty title falls back to a generated one.
            title = self._sanitize_title(note_info.get('title') or f'xhs_{note_id}')

            logger.debug(f"[小红书图文] 图片URL处理耗时: {time.perf_counter()-t3:.2f}秒")
            total_time = time.perf_counter() - start_time
            logger.debug(f"[小红书图文] 解析完成,总耗时: {total_time:.2f}秒,图片数量: {len(images)}")
            logger.debug(f"{'='*60}\n")

            return {
                'note_id': note_id,
                'title': title,
                'desc': note_info.get('desc', ''),
                'type': 'image',
                'images': images,
            }
        except (KeyError, TypeError) as e:
            # Unexpected page-data structure: surface as ValueError, keep the cause.
            raise ValueError(f"解析笔记数据失败: {str(e)}") from e

    def parse_share_url(self, share_text: str) -> Dict[str, str]:
        """Parse a share link and return video info: ``url`` / ``title`` / ``note_id``.

        Strategy:
        - Collect ``<video src>`` and ``<meta og:video>`` links from the HTML
          (falling back to any ``.mp4`` link on ``xhscdn.com``).
        - Choose the watermark-free link via :meth:`get_watermark_free_url`.

        Raises:
            ValueError: if no URL is found in *share_text* or the page yields
                no video candidates.
        """
        start_time = time.perf_counter()

        share_url = self._extract_first_url(share_text)
        logger.debug(f"[小红书视频] 提取到的链接: {share_url}")

        # NOTE: the note ID is read from the URL as shared, not from the
        # post-redirect URL, so it is empty when the shared URL has no
        # note path.
        note_id = self._extract_note_id_from_path(share_url)

        t1 = time.perf_counter()
        html = self._fetch_html(share_url)
        logger.debug(f"[小红书视频] 页面请求耗时: {time.perf_counter()-t1:.2f}秒")

        # Title: og:title first, then og:description, then a generated name.
        title = (
            self._extract_meta(html, "og:title")
            or self._extract_meta(html, "og:description", key="content")
            or (f"xhs_{note_id}" if note_id else "xhs")
        )
        title = self._sanitize_title(title)

        # Candidate direct links: 1) <video> tags, 2) og:video.
        candidates: List[Tuple[str, str]] = [(v, "video") for v in self._extract_all_video_src(html)]
        ogv = self._extract_meta(html, "og:video")
        if ogv:
            candidates.append((ogv, "og"))

        if not candidates:
            # Last resort: scan the page for any .mp4 link mentioning xhscdn.com.
            for m in re.finditer(r"https?://[^\s'\"]+?\.mp4", html, re.IGNORECASE):
                if "xhscdn.com" in m.group(0):
                    candidates.append((m.group(0), "fallback"))

        logger.debug(f"[小红书视频] 找到 {len(candidates)} 个候选视频URL")

        t2 = time.perf_counter()
        final_url = self.get_watermark_free_url(candidates)
        logger.debug(f"[小红书视频] URL筛选耗时: {time.perf_counter()-t2:.2f}秒")

        total_time = time.perf_counter() - start_time
        logger.debug(f"[小红书视频] 解析完成,总耗时: {total_time:.2f}秒")
        logger.debug(f"{'='*60}\n")

        return {
            "url": final_url,
            "title": title,
            "note_id": note_id or "",
        }


if __name__ == "__main__":
    # Quick manual test:
    #   python -m douyin_mcp_server.xiaohongshu_processor "<xhs_url_or_text>"
    #   python -m douyin_mcp_server.xiaohongshu_processor "<xhs_url_or_text>" --image
    if len(sys.argv) < 2:
        print("用法: python -m douyin_mcp_server.xiaohongshu_processor <小红书链接或文本> [--image]")
        print(" 默认: 解析视频笔记")
        print(" --image: 解析图文笔记")
        sys.exit(1)

    share = sys.argv[1]
    is_image = "--image" in sys.argv

    p = XiaohongshuProcessor()

    if is_image:
        # Image note.
        data = p.parse_image_note(share)
        print(f"\n{'='*60}")
        print(f"标题: {data['title']}")
        print(f"笔记 ID: {data['note_id']}")
        print(f"类型: {data['type']}")
        print(f"{'='*60}")
        print(f"\n正文内容:\n{data['desc']}")
        print(f"\n{'='*60}")
        print(f"图片数量: {len(data['images'])}\n")
        for i, img in enumerate(data['images'], 1):
            print(f"图片 {i}: {img['width']}x{img['height']}")
            print(f" WebP (轻量): {img['url_webp'][:80]}...")
            print(f" PNG (高清): {img['url_png']}\n")
    else:
        # Video note.
        data = p.parse_share_url(share)
        print(json.dumps(data, ensure_ascii=False, indent=2))

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Ryan7t/wanyi-watermark'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.