#!/usr/bin/env python3
"""
抖音无水印视频下载并提取文本的 MCP 服务器
该服务器提供以下功能:
1. 解析抖音分享链接获取无水印视频链接
2. 下载视频并提取音频
3. 从音频中提取文本内容
4. 自动清理中间文件
"""
import os
import re
import json
import requests
import tempfile
import asyncio
import toml
import argparse
from pathlib import Path
from typing import Optional, Tuple
import ffmpeg
from tqdm.asyncio import tqdm
from mcp.server.fastmcp import FastMCP
from mcp.server.fastmcp import Context
# 读取配置文件
def load_config():
    """Load this server's settings from ``pyproject.toml``.

    The file is looked up in the parent of the directory that contains this
    module.

    Returns:
        The ``[tool.douyin-mcp-server]`` table as a dict, or an empty dict
        when the file does not exist or has no such table.
    """
    pyproject = Path(__file__).parent.parent / "pyproject.toml"
    if not pyproject.exists():
        return {}
    parsed = toml.load(pyproject)
    tool_tables = parsed.get("tool", {})
    return tool_tables.get("douyin-mcp-server", {})
# Load the project configuration once, at import time.
CONFIG = load_config()

# API key for the speech-recognition service. The DOUYIN_API_KEY environment
# variable takes precedence over `api_key` in pyproject.toml. No key is
# embedded in the source: a secret must not live in version control, and a
# built-in default also makes any "API key not configured" check unreachable.
DOUYIN_API_KEY = os.environ.get("DOUYIN_API_KEY") or CONFIG.get("api_key") or ""

# The MCP server instance that the tools, resources and prompts register on.
mcp = FastMCP(
    "Douyin MCP Server",
    dependencies=["requests", "ffmpeg-python", "tqdm"],
)

# Request headers that imitate a mobile browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) EdgiOS/121.0.2277.107 Version/17.0 Mobile/15E148 Safari/604.1'
}

# Transcription endpoint and model, taken from the configuration when present
# and falling back to these defaults otherwise.
DEFAULT_API_BASE_URL = CONFIG.get("api_base_url", "https://api.siliconflow.cn/v1/audio/transcriptions")
DEFAULT_MODEL = CONFIG.get("model", "FunAudioLLM/SenseVoiceSmall")
class DouyinProcessor:
    """Resolve a Douyin share link, download the video and transcribe its audio.

    Every instance creates its own temporary directory (``temp_dir``) for the
    downloaded video and the extracted audio; the directory is removed when
    the instance is garbage-collected.
    """

    # (connect, read) timeouts in seconds. requests applies no timeout unless
    # one is passed, so without these a stalled peer would hang the call forever.
    _HTTP_TIMEOUT = (10, 60)
    # The transcription service may take a while before it answers, so the
    # read timeout for the upload is more generous.
    _TRANSCRIBE_TIMEOUT = (10, 600)
    # Number of bytes requested per read while streaming the video to disk.
    _DOWNLOAD_CHUNK_SIZE = 64 * 1024
    # Extracts the JSON blob assigned to window._ROUTER_DATA in the share page.
    _ROUTER_DATA_RE = re.compile(r"window\._ROUTER_DATA\s*=\s*(.*?)</script>", re.DOTALL)

    def __init__(self, api_key: str, api_base_url: Optional[str] = None, model: Optional[str] = None):
        """Store the API settings and create the temporary working directory.

        Args:
            api_key: Bearer token sent to the transcription endpoint.
            api_base_url: Transcription endpoint; falls back to
                ``DEFAULT_API_BASE_URL`` when empty or ``None``.
            model: Speech-recognition model name; falls back to
                ``DEFAULT_MODEL`` when empty or ``None``.
        """
        self.api_key = api_key
        self.api_base_url = api_base_url or DEFAULT_API_BASE_URL
        self.model = model or DEFAULT_MODEL
        self.temp_dir = Path(tempfile.mkdtemp())

    def __del__(self):
        """Remove the temporary directory, ignoring errors."""
        import shutil
        # temp_dir may be missing if __init__ failed before assigning it.
        if hasattr(self, 'temp_dir') and self.temp_dir.exists():
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def parse_share_url(self, share_text: str) -> dict:
        """Extract the watermark-free video link from a piece of share text.

        Args:
            share_text: A share URL, or any text that contains one. Only the
                first URL found is used.

        Returns:
            A dict with the keys ``url`` (watermark-free play address),
            ``title`` (the description made safe for use as a file name, or
            ``douyin_<video_id>`` when the description is empty) and
            ``video_id``.

        Raises:
            ValueError: No URL was found, the page data could not be located,
                or the page lists no items.
            Exception: The page data holds neither a video nor a note entry.
            requests.RequestException: A request failed or timed out.
        """
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', share_text)
        if not urls:
            raise ValueError("未找到有效的分享链接")

        # Follow the redirects of the share link; the last path segment of the
        # final URL is the video id.
        share_response = requests.get(urls[0], headers=HEADERS, timeout=self._HTTP_TIMEOUT)
        video_id = share_response.url.split("?")[0].strip("/").split("/")[-1]
        page_url = f'https://www.iesdouyin.com/share/video/{video_id}'

        # Fetch the share page that embeds the video metadata.
        response = requests.get(page_url, headers=HEADERS, timeout=self._HTTP_TIMEOUT)
        response.raise_for_status()

        found = self._ROUTER_DATA_RE.search(response.text)
        if not found or not found.group(1):
            raise ValueError("从HTML中解析视频信息失败")

        loader_data = json.loads(found.group(1).strip())["loaderData"]
        # The metadata is stored under the video page key or the note page key.
        for page_key in ("video_(id)/page", "note_(id)/page"):
            if page_key in loader_data:
                original_video_info = loader_data[page_key]["videoInfoRes"]
                break
        else:
            raise Exception("无法从JSON中解析视频或图集信息")

        # Report an empty item list explicitly instead of a bare IndexError.
        items = original_video_info.get("item_list") or []
        if not items:
            raise ValueError("未获取到视频数据,视频可能已被删除或无法访问")
        data = items[0]

        # Replacing "playwm" with "play" in the address selects the
        # watermark-free variant.
        video_url = data["video"]["play_addr"]["url_list"][0].replace("playwm", "play")
        desc = data.get("desc", "").strip() or f"douyin_{video_id}"
        # Replace characters that are not allowed in file names.
        desc = re.sub(r'[\\/:*?"<>|]', '_', desc)
        return {
            "url": video_url,
            "title": desc,
            "video_id": video_id
        }

    async def download_video(self, video_info: dict, ctx: Context) -> Path:
        """Download the video into the temporary directory.

        The blocking network calls run in worker threads so the event loop
        stays responsive while the download is in progress.

        Args:
            video_info: Dict as returned by ``parse_share_url``.
            ctx: MCP context used for log messages and progress reports.

        Returns:
            Path of the downloaded ``<video_id>.mp4`` file.

        Raises:
            requests.RequestException: The download failed or timed out.
        """
        filename = f"{video_info['video_id']}.mp4"
        filepath = self.temp_dir / filename
        await ctx.info(f"正在下载视频: {video_info['title']}")

        response = await asyncio.to_thread(
            requests.get,
            video_info['url'],
            headers=HEADERS,
            stream=True,
            timeout=self._HTTP_TIMEOUT,
        )
        # The context manager releases the streamed connection on every path.
        with response:
            response.raise_for_status()
            # 0 means the size is unknown; progress is then not reported.
            total_size = int(response.headers.get('content-length', 0))
            chunks = response.iter_content(chunk_size=self._DOWNLOAD_CHUNK_SIZE)
            downloaded = 0
            with open(filepath, 'wb') as f:
                while True:
                    # None marks the end of the stream; passing a default to
                    # next() keeps StopIteration from crossing the thread.
                    chunk = await asyncio.to_thread(next, chunks, None)
                    if chunk is None:
                        break
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size > 0:
                        await ctx.report_progress(downloaded, total_size)

        await ctx.info(f"视频下载完成: {filepath}")
        return filepath

    def extract_audio(self, video_path: Path) -> Path:
        """Extract the audio track of a video file into an MP3 next to it.

        Args:
            video_path: Path of the video file.

        Returns:
            Path of the MP3 file (``video_path`` with the suffix ``.mp3``).

        Raises:
            Exception: ffmpeg failed; the message includes the last lines of
                ffmpeg's stderr when they are available.
        """
        audio_path = video_path.with_suffix('.mp3')
        try:
            (
                ffmpeg
                .input(str(video_path))
                .output(str(audio_path), acodec='libmp3lame', q=0)
                .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
            )
        except Exception as e:
            detail = str(e)
            # ffmpeg.Error carries the captured stderr; its own message only
            # says "see stderr output for detail".
            stderr = getattr(e, "stderr", None)
            if isinstance(stderr, bytes) and stderr.strip():
                tail = stderr.decode("utf-8", errors="replace").strip().splitlines()[-5:]
                detail = detail + ": " + " | ".join(tail)
            raise Exception(f"提取音频时出错: {detail}") from e
        return audio_path

    def extract_text_from_audio(self, audio_path: Path) -> str:
        """Send an audio file to the transcription endpoint and return the text.

        Args:
            audio_path: Path of the audio file to upload.

        Returns:
            The ``text`` field of the JSON response, or the raw response body
            when the response has no such field.

        Raises:
            OSError: The audio file could not be opened.
            Exception: The request failed or the response could not be parsed.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}"
        }
        with open(audio_path, 'rb') as audio_file:
            files = {
                'file': (audio_path.name, audio_file, 'audio/mpeg'),
                'model': (None, self.model)
            }
            try:
                response = requests.post(
                    self.api_base_url,
                    files=files,
                    headers=headers,
                    timeout=self._TRANSCRIBE_TIMEOUT,
                )
                response.raise_for_status()
                result = response.json()
                if 'text' in result:
                    return result['text']
                return response.text
            except Exception as e:
                raise Exception(f"提取文字时出错: {str(e)}") from e

    def cleanup_files(self, *file_paths: Path):
        """Delete the given files; files that do not exist are skipped."""
        for file_path in file_paths:
            file_path.unlink(missing_ok=True)
@mcp.tool()
def get_douyin_download_link(share_link: str) -> str:
    """
    Get the watermark-free download link of a Douyin video.

    Args:
    - share_link: A Douyin share link, or text that contains one

    Returns:
    - A JSON string with the download link and the video information; on
      failure, a JSON string with ``status`` set to ``error`` and a message
    """
    def as_json(payload: dict) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    try:
        # Resolving the download link does not need an API key.
        info = DouyinProcessor("").parse_share_url(share_link)
        title = info["title"]
        return as_json({
            "status": "success",
            "video_id": info["video_id"],
            "title": title,
            "download_url": info["url"],
            "description": f"视频标题: {title}",
            "usage_tip": "可以直接使用此链接下载无水印视频"
        })
    except Exception as e:
        return as_json({
            "status": "error",
            "error": f"获取下载链接失败: {str(e)}"
        })
@mcp.tool()
async def extract_douyin_text(
    share_link: str,
    api_base_url: Optional[str] = None,
    model: Optional[str] = None,
    ctx: Context = None
) -> str:
    """
    Extract the spoken text of a video from a Douyin share link.

    Args:
    - share_link: A Douyin share link, or text that contains one
    - api_base_url: Transcription API URL (optional, defaults to SiliconFlow)
    - model: Speech-recognition model (optional, defaults to SenseVoiceSmall)

    Returns:
    - The extracted text

    Raises:
    - Exception: any step failed; the original error is attached as the cause

    Note: an API key is required; it is read from the module-level
    DOUYIN_API_KEY value.
    """
    import shutil

    processor = None
    try:
        api_key = DOUYIN_API_KEY
        if not api_key:
            raise ValueError("未设置API密钥,请在pyproject.toml中配置api_key或设置环境变量DOUYIN_API_KEY")
        processor = DouyinProcessor(api_key, api_base_url, model)

        # The processor's synchronous steps (HTTP requests, ffmpeg, upload)
        # block, so they run in worker threads to keep the event loop free.
        await ctx.info("正在解析抖音分享链接...")
        video_info = await asyncio.to_thread(processor.parse_share_url, share_link)

        await ctx.info("正在下载视频...")
        video_path = await processor.download_video(video_info, ctx)

        await ctx.info("正在提取音频...")
        audio_path = await asyncio.to_thread(processor.extract_audio, video_path)

        await ctx.info("正在从音频中提取文本...")
        text_content = await asyncio.to_thread(processor.extract_text_from_audio, audio_path)

        await ctx.info("正在清理临时文件...")
        processor.cleanup_files(video_path, audio_path)
        await ctx.info("文本提取完成!")
        return text_content
    except Exception as e:
        await ctx.error(f"处理过程中出现错误: {str(e)}")
        raise Exception(f"提取抖音视频文本失败: {str(e)}") from e
    finally:
        # Remove the working directory on every path. On failure the traceback
        # keeps the processor alive, so relying on its finalizer would leave
        # partial downloads on disk for an unknown time.
        if processor is not None:
            shutil.rmtree(processor.temp_dir, ignore_errors=True)
@mcp.tool()
def parse_douyin_video_info(share_link: str) -> str:
    """
    Parse a Douyin share link and return the basic video information.

    Args:
    - share_link: A Douyin share link, or text that contains one

    Returns:
    - The video information as a JSON string; on failure, a JSON string with
      ``status`` set to ``error`` and the error message
    """
    def as_json(payload: dict) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    try:
        # Parsing the link does not need an API key.
        info = DouyinProcessor("").parse_share_url(share_link)
        return as_json({
            "video_id": info["video_id"],
            "title": info["title"],
            "download_url": info["url"],
            "status": "success"
        })
    except Exception as e:
        return as_json({
            "status": "error",
            "error": str(e)
        })
@mcp.resource("douyin://video/{video_id}")
def get_video_info(video_id: str) -> str:
    """
    Get the details of the video with the given ID.

    Args:
    - video_id: Douyin video ID

    Returns:
    - The parsed video information as a JSON string, or a plain error message
      when the lookup fails
    """
    # Build the canonical share-page URL and reuse the share-link parser.
    page_url = f"https://www.iesdouyin.com/share/video/{video_id}"
    try:
        details = DouyinProcessor("").parse_share_url(page_url)
        serialized = json.dumps(details, ensure_ascii=False, indent=2)
    except Exception as e:
        return f"获取视频信息失败: {str(e)}"
    return serialized
@mcp.prompt()
def douyin_text_extraction_guide() -> str:
    """Usage guide for extracting text from Douyin videos."""
    # The guide is a fixed Markdown document returned verbatim.
    guide = """
# 抖音视频文本提取使用指南
## 功能说明
这个MCP服务器可以从抖音分享链接中提取视频的文本内容,以及获取无水印下载链接。
## 环境变量配置
请确保设置了以下环境变量:
- `DOUYIN_API_KEY`: 语音识别API密钥(如SiliconFlow API密钥)
## 使用步骤
1. 复制抖音视频的分享链接
2. 在Claude Desktop配置中设置环境变量 DOUYIN_API_KEY
3. 使用相应的工具进行操作
## 工具说明
- `extract_douyin_text`: 完整的文本提取流程(需要API密钥)
- `get_douyin_download_link`: 获取无水印视频下载链接(无需API密钥)
- `parse_douyin_video_info`: 仅解析视频基本信息
- `douyin://video/{video_id}`: 获取指定视频的详细信息
## Claude Desktop 配置示例
```json
{
"mcpServers": {
"douyin-mcp": {
"command": "uvx",
"args": ["douyin-mcp-server"],
"env": {
"DOUYIN_API_KEY": "your-SiliconFlow-api-key-here"
}
}
}
}
```
## 注意事项
- 需要提供有效的API密钥(通过环境变量)
- 中间文件会自动清理
- 支持大部分抖音视频格式
- 获取下载链接无需API密钥
"""
    return guide
def main(port: Optional[int] = None, host: Optional[str] = None, log_level: Optional[str] = None):
    """Start the MCP server.

    Each setting is resolved in this order: the argument passed in, then the
    ``server`` table of the configuration, then a built-in default.

    Args:
        port: Server port; built-in default 8073.
        host: Server host address; built-in default "0.0.0.0".
        log_level: Log level; built-in default "INFO".
    """
    server_config = CONFIG.get("server", {})

    # setting name -> (explicit argument, built-in default)
    resolution = {
        "port": (port, 8073),
        "host": (host, "0.0.0.0"),
        "log_level": (log_level, "INFO"),
    }
    for setting, (explicit, builtin_default) in resolution.items():
        value = explicit or server_config.get(setting, builtin_default)
        setattr(mcp.settings, setting, value)

    print(f"启动抖音MCP服务器 - {mcp.settings.host}:{mcp.settings.port}")
    print(f"API密钥配置: {'已配置' if DOUYIN_API_KEY else '未配置'}")

    # Serve over the streamable-http transport.
    mcp.run(transport="streamable-http")
def parse_args():
    """Parse the command-line options ``--port``, ``--host`` and ``--log-level``.

    Returns:
        The ``argparse.Namespace`` with ``port``, ``host`` and ``log_level``;
        an option that was not given is ``None``.
    """
    parser = argparse.ArgumentParser(description="抖音MCP服务器")

    # (flag, keyword arguments for add_argument), registered in this order.
    option_table = (
        ("--port", {
            "type": int,
            "help": "服务器端口号(默认从配置文件读取,配置文件未设置则为8073)",
        }),
        ("--host", {
            "type": str,
            "help": "服务器主机地址(默认从配置文件读取,配置文件未设置则为0.0.0.0)",
        }),
        ("--log-level", {
            "type": str,
            "choices": ['DEBUG', 'INFO', 'WARNING', 'ERROR'],
            "help": "日志级别(默认从配置文件读取,配置文件未设置则为INFO)",
        }),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()
def main_cli():
    """Command-line entry point: parse the arguments and start the server."""
    cli_options = parse_args()
    main(
        port=cli_options.port,
        host=cli_options.host,
        log_level=cli_options.log_level,
    )


if __name__ == "__main__":
    main_cli()