# douyin.py (4.58 kB)
import json
import re
import httpx
from .base import BaseParser, ImgInfo, VideoAuthor, VideoInfo
# Request headers that imitate a mobile (iPhone) browser.
header = {
    "User-Agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "EdgiOS/121.0.2277.107 Version/17.0 Mobile/15E148 Safari/604.1"
    ),
}
class DouYin(BaseParser):
    """Parser for Douyin / Douyin Huoshan edition share links.

    Resolves a share link to the ``iesdouyin.com`` share page, reads the
    ``window._ROUTER_DATA`` JSON embedded in that page and converts it into
    a ``VideoInfo`` (video post) or a ``VideoInfo`` carrying an image list
    (image-gallery post).
    """

    # Keys under ``loaderData`` in the embedded JSON: the first is looked up
    # for video posts, the second for image-gallery ("note") posts.
    _VIDEO_ID_PAGE_KEY = "video_(id)/page"
    _NOTE_ID_PAGE_KEY = "note_(id)/page"

    # Finds the first http(s) URL inside free-form share text.
    _SHARE_URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    # Captures the JSON assigned to window._ROUTER_DATA, up to the closing
    # </script> tag.
    _ROUTER_DATA_RE = re.compile(
        r"window\._ROUTER_DATA\s*=\s*(.*?)</script>",
        flags=re.DOTALL,
    )

    async def parse_share_url(self, share_text: str) -> VideoInfo:
        """Extract the watermark-free video link (or images) from share text.

        Args:
            share_text: Text containing at least one http(s) share link; the
                first link found is used.

        Returns:
            A ``VideoInfo`` with the play URL, cover, title, images and
            author. For image galleries ``video_url`` is an empty string.

        Raises:
            ValueError: If no link is found in ``share_text``, the share page
                has no ``window._ROUTER_DATA`` payload, or the payload has
                neither a video nor a note entry.
            httpx.HTTPStatusError: If the share detail page request returns
                an error status.
            KeyError, IndexError: If expected fields are missing from the
                payload (for example an empty ``item_list``).
        """
        url_match = self._SHARE_URL_RE.search(share_text)
        if url_match is None:
            raise ValueError("未找到有效的分享链接")
        share_url = url_match.group(0)

        async with httpx.AsyncClient(follow_redirects=True) as client:
            # Follow the short link; the video id is the last path segment
            # of the final URL (query string and trailing slash removed).
            share_response = await client.get(share_url, headers=header)
            video_id = (
                str(share_response.url).split("?")[0].strip("/").split("/")[-1]
            )
            detail_url = f'https://www.iesdouyin.com/share/video/{video_id}'
            response = await client.get(detail_url, headers=header)
            response.raise_for_status()

        data = self._extract_item(response.text)
        images = self._extract_images(data)

        if images:
            # Image gallery: there is no playable video (the original note
            # says the address Douyin returns here is not accessible), so
            # the play address is not read at all.
            video_url = ""
        else:
            # Play address (a direct snssdk.com link per the original note);
            # swapping "playwm" for "play" selects the watermark-free URL.
            video_url = data["video"]["play_addr"]["url_list"][0].replace(
                "playwm", "play"
            )

        author = data["author"]
        return VideoInfo(
            video_url=video_url,
            cover_url=data["video"]["cover"]["url_list"][0],
            # A missing, null or blank description falls back to a
            # generated title.
            title=(data.get("desc") or "").strip() or f"douyin_{video_id}",
            images=images,
            author=VideoAuthor(
                uid=author["sec_uid"],
                name=author["nickname"],
                avatar=author["avatar_thumb"]["url_list"][0],
            ),
        )

    @classmethod
    def _extract_item(cls, html: str) -> dict:
        """Return the first item of the video/note entry embedded in ``html``."""
        found = cls._ROUTER_DATA_RE.search(html)
        if not found or not found.group(1):
            raise ValueError("从HTML中解析视频信息失败")

        loader_data = json.loads(found.group(1).strip())["loaderData"]
        # The video entry is preferred; the note entry is the fallback.
        for page_key in (cls._VIDEO_ID_PAGE_KEY, cls._NOTE_ID_PAGE_KEY):
            if page_key in loader_data:
                return loader_data[page_key]["videoInfoRes"]["item_list"][0]
        raise ValueError("无法从JSON中解析视频或图集信息")

    @staticmethod
    def _extract_images(data: dict) -> list:
        """Collect an ``ImgInfo`` for the first non-empty URL of each image."""
        raw_images = data.get("images")
        if not isinstance(raw_images, list):
            return []

        images = []
        for img in raw_images:
            url_list = img.get("url_list") if isinstance(img, dict) else None
            # Only the first URL of each image is used; empty ones are skipped.
            if isinstance(url_list, list) and url_list and url_list[0]:
                images.append(ImgInfo(url=url_list[0]))
        return images

    async def get_video_redirect_url(self, video_url: str) -> str:
        """Return the redirect target of ``video_url`` without following it.

        Args:
            video_url: The play URL to probe.

        Returns:
            The ``location`` response header if present and non-empty,
            otherwise ``video_url`` itself (the original note says Xigua
            videos inside Douyin come back with an empty redirect address).
        """
        async with httpx.AsyncClient(follow_redirects=False) as client:
            response = await client.get(video_url, headers=header)
        return response.headers.get("location") or video_url

    async def parse_video_id(self, video_id: str) -> VideoInfo:
        """Parse a video by id by building its share URL and parsing that.

        Args:
            video_id: The Douyin video id.

        Returns:
            The ``VideoInfo`` produced by ``parse_share_url``.
        """
        req_url = self._get_request_url_by_video_id(video_id)
        return await self.parse_share_url(req_url)

    def _get_request_url_by_video_id(self, video_id: str) -> str:
        """Build the ``iesdouyin.com`` share-page URL for ``video_id``."""
        return f"https://www.iesdouyin.com/share/video/{video_id}/"