Skip to main content
Glama
Ryan7t
by Ryan7t

extract_douyin_text

Extract text content from Douyin (TikTok) videos using share links, supporting optional speech recognition models for transcription.

Instructions

从抖音分享链接提取视频中的文本内容

参数:
- share_link: 抖音分享链接或包含链接的文本
- model: 语音识别模型(可选,默认使用paraformer-v2)

返回:
- 提取的文本内容

注意: 需要设置环境变量 DASHSCOPE_API_KEY

Input Schema

Table | JSON Schema

| Name | Required | Description | Default |
| --- | --- | --- | --- |
| share_link | Yes | | |
| model | No | | |

Output Schema

Table | JSON Schema

| Name | Required | Description | Default |
| --- | --- | --- | --- |
| result | Yes | | |

Implementation Reference

  • The main async handler function for the 'extract_douyin_text' MCP tool. It is decorated with @mcp.tool() for registration, defines its input schema via type hints and its docstring, parses the share link using DouyinProcessor, and calls extract_text_from_video_url on the resulting video URL.
    @mcp.tool()
    async def extract_douyin_text(
        share_link: str,
        model: Optional[str] = None,
        ctx: Context = None
    ) -> str:
        """
        从抖音分享链接提取视频中的文本内容
    
        参数:
        - share_link: 抖音分享链接或包含链接的文本
        - model: 语音识别模型(可选,默认使用paraformer-v2)
    
        返回:
        - 提取的文本内容
    
        注意: 需要设置环境变量 DASHSCOPE_API_KEY
        """
        # NOTE: the docstring above is intentionally left byte-identical. It is
        # published to MCP clients as this tool's description, so it is runtime
        # text rather than a plain comment.
        #
        # English summary: extract the spoken text of a Douyin video.
        #   share_link -- a Douyin share link, or free text containing one.
        #   model      -- optional speech-recognition model name, forwarded
        #                 unchanged to DouyinProcessor.
        #   ctx        -- request context used for progress/error messages;
        #                 may be None, in which case messages are skipped.
        # Returns the text produced by extract_text_from_video_url.
        # Raises Exception (chained to the original cause) on any failure,
        # including a missing DASHSCOPE_API_KEY environment variable.
        import inspect

        async def _notify(level: str, message: str) -> None:
            """Send `message` through ctx.<level> if a context is available."""
            # ctx defaults to None; dereferencing it blindly would raise
            # AttributeError and, in the error path, hide the real failure.
            if ctx is None:
                return
            outcome = getattr(ctx, level)(message)
            # The context's logging methods may be coroutines; an un-awaited
            # coroutine would silently never deliver the message.
            if inspect.isawaitable(outcome):
                await outcome

        try:
            # The API key comes from the environment, never from tool arguments.
            api_key = os.getenv('DASHSCOPE_API_KEY')
            if not api_key:
                raise ValueError("未设置环境变量 DASHSCOPE_API_KEY,请在配置中添加阿里云百炼API密钥")

            processor = DouyinProcessor(api_key, model)

            # Step 1: resolve the share link to the underlying video URL.
            await _notify("info", "正在解析抖音分享链接...")
            video_info = processor.parse_share_url(share_link)

            # Step 2: transcribe straight from the remote video URL.
            await _notify("info", "正在从视频中提取文本...")
            text_content = processor.extract_text_from_video_url(video_info['url'])

            await _notify("info", "文本提取完成!")
            return text_content

        except Exception as e:
            await _notify("error", f"处理过程中出现错误: {str(e)}")
            # Keep the original exception as __cause__ so the traceback is
            # not lost when the error is re-wrapped.
            raise Exception(f"提取抖音视频文本失败: {str(e)}") from e
  • Core helper function that performs the actual text extraction from the video URL using Dashscope's audio ASR Transcription API (async_call and wait). This is the key implementation logic for speech-to-text.
    def extract_text_from_video_url(self, video_url: str) -> str:
        """Transcribe the speech of a remote video and return its text.

        Submits `video_url` to the DashScope ASR Transcription API using
        `self.model`, waits for the task to finish, downloads the first
        result's transcription JSON, stores a copy of it as
        `transcription.json` under `self.temp_dir`, and returns the text of
        the first transcript.

        Args:
            video_url: URL of the video handed to the transcription service.

        Returns:
            The text of the first transcript, or the placeholder message
            "未识别到文本内容" when the service returned no results or no
            transcripts.

        Raises:
            Exception: if the transcription task does not finish with HTTP
                200, or if any step fails; the original error is chained as
                ``__cause__``.
        """
        try:
            # Start the asynchronous transcription task.
            task_response = dashscope.audio.asr.Transcription.async_call(
                model=self.model,
                file_urls=[video_url],
                language_hints=['zh', 'en']
            )

            # Block until the task completes.
            transcription_response = dashscope.audio.asr.Transcription.wait(
                task=task_response.output.task_id
            )

            if transcription_response.status_code != HTTPStatus.OK:
                raise Exception(f"转录失败: {transcription_response.output.message}")

            # Only one file was submitted, so only the first result matters.
            for transcription in transcription_response.output['results']:
                url = transcription['transcription_url']
                # Close the HTTP response deterministically instead of
                # leaving the socket to the garbage collector.
                with urlrequest.urlopen(url) as reply:
                    result = json.loads(reply.read().decode('utf8'))

                # Keep a copy of the raw result on disk. The file is written
                # with ensure_ascii=False, so the encoding must be explicit:
                # the platform default may be unable to encode Chinese text.
                temp_json_path = self.temp_dir / 'transcription.json'
                with open(temp_json_path, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=4, ensure_ascii=False)

                transcripts = result.get('transcripts')
                if transcripts:
                    return transcripts[0]['text']
                return "未识别到文本内容"

            # An empty result list previously fell through and returned None,
            # contradicting the declared `str` return type.
            return "未识别到文本内容"

        except Exception as e:
            raise Exception(f"提取文字时出错: {str(e)}") from e
  • Helper function in DouyinProcessor class that parses the Douyin share link, extracts video ID, scrapes the page for JSON data, and returns the no-watermark video URL along with caption and title. Called by the handler to obtain the video_url.
    def parse_share_url(self, share_text: str) -> Dict[str, str]:
        """Resolve Douyin share text to the watermark-free video URL.

        Takes the first http(s) URL found in `share_text`, follows its
        redirects to obtain the video id, fetches the
        ``https://www.iesdouyin.com/share/video/<id>`` page and reads the
        JSON assigned to ``window._ROUTER_DATA`` in that page.

        Args:
            share_text: A share link, or free text that contains one.

        Returns:
            A dict with the keys ``url`` (play address with ``playwm``
            replaced by ``play``), ``title`` (description with characters
            that are illegal in file names replaced by ``_``), ``caption``
            (the raw description, or ``douyin_<id>`` when it is empty) and
            ``video_id``.

        Raises:
            ValueError: no URL in `share_text`; the page contains no
                ``_ROUTER_DATA`` payload; the item is an image note; or the
                item carries no video / no play address.
            Exception: the payload has neither a video nor a note page entry.
            requests.HTTPError: the share page request returned an error
                status.
        """
        import time
        start_time = time.time()

        # Extract the share link. The pattern is kept verbatim: its `$-_`
        # range is what allows `/`, `:` and `?` inside the matched URL.
        # `or ""` turns None/empty input into the regular "no link" error
        # instead of a TypeError from the regex engine.
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', share_text or "")
        if not urls:
            raise ValueError("未找到有效的分享链接")

        share_url = urls[0]
        logger.debug(f"[抖音视频] 提取到的链接: {share_url}")

        # Follow the short link; the final URL's last path segment is the id.
        t1 = time.time()
        share_response = requests.get(share_url, headers=HEADERS, timeout=10, allow_redirects=True)
        logger.debug(f"[抖音视频] 短链接重定向耗时: {time.time()-t1:.2f}秒")

        video_id = share_response.url.split("?")[0].strip("/").split("/")[-1]
        logger.debug(f"[抖音视频] 视频ID: {video_id}")

        share_url = f'https://www.iesdouyin.com/share/video/{video_id}'

        # Fetch the share page itself.
        t2 = time.time()
        response = requests.get(share_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        logger.debug(f"[抖音视频] 页面请求耗时: {time.time()-t2:.2f}秒")

        pattern = re.compile(
            pattern=r"window\._ROUTER_DATA\s*=\s*(.*?)</script>",
            flags=re.DOTALL,
        )
        find_res = pattern.search(response.text)

        if not find_res or not find_res.group(1):
            raise ValueError("从HTML中解析视频信息失败")

        # Parse the embedded JSON payload.
        json_data = json.loads(find_res.group(1).strip())
        VIDEO_ID_PAGE_KEY = "video_(id)/page"
        NOTE_ID_PAGE_KEY = "note_(id)/page"

        # The item info lives under one of two page keys; video is preferred.
        loader_data = json_data["loaderData"]
        for page_key in (VIDEO_ID_PAGE_KEY, NOTE_ID_PAGE_KEY):
            if page_key in loader_data:
                original_video_info = loader_data[page_key]["videoInfoRes"]
                break
        else:
            raise Exception("无法从JSON中解析视频或图集信息")

        # An empty or missing item list used to surface as a bare IndexError.
        item_list = original_video_info.get("item_list") or []
        if not item_list:
            raise ValueError("未找到视频信息")
        data = item_list[0]

        # Image notes are rejected here; this method only handles videos.
        if data.get("images"):
            raise ValueError("这是图文笔记,请使用 parse_image_note 方法")

        # Make sure a video entry with at least one play address exists.
        video = data.get("video")
        if not video:
            raise ValueError("未找到视频信息")
        url_list = (video.get("play_addr") or {}).get("url_list") or []
        if not url_list:
            raise ValueError("未找到视频信息")

        # Remove the watermark variant from the address: playwm -> play.
        video_url = url_list[0].replace("playwm", "play")

        # `desc` may be absent or null; fall back to a name built from the id.
        raw_desc = (data.get("desc") or "").strip()
        if not raw_desc:
            raw_desc = f"douyin_{video_id}"

        # Replace characters that are illegal in file names (naming only).
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', raw_desc)

        total_time = time.time() - start_time
        logger.debug(f"[抖音视频] 解析完成,总耗时: {total_time:.2f}秒")
        logger.debug(f"{'='*60}\n")

        return {
            "url": video_url,
            "title": safe_title,
            "caption": raw_desc,
            "video_id": video_id
        }
  • Tool listing in the watermark_removal_guide prompt, describing the extract_douyin_text tool.
    ### 特殊功能
    - `extract_douyin_text`: 从抖音视频中提取语音文本内容(需要 API 密钥)

Tool Definition Quality

Score is being calculated. Check back soon.

Install Server

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Ryan7t/wanyi-watermark'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.