Skip to main content
Glama
crawler.py (3.31 kB)
import asyncio
import datetime

from base import AbstractCrawler
from config import CNBLOGS_MT_KEYWORDS
from extension.cnblogs.client import CnBlogsMetaBlogClient
from extension.crawler_factory import get_crawler_setup_source
from utils import logger
from tenacity import retry, stop_after_attempt, wait_fixed, RetryError


class CnBlogsCrawler(AbstractCrawler):
    """Crawler that publishes an article through ``CnBlogsMetaBlogClient``.

    Typical flow as visible in this class: ``init_config`` uploads the
    article's images, rewrites their links in the Markdown and stores the
    post payload in ``self.post_data``; ``run`` then publishes that payload.
    """

    def __init__(self):
        self.type_crawler = "CnBlogs Crawler"
        self.domain_crawler = ".cnblogs.com"
        self._cnBlogsMetaBlogClient = CnBlogsMetaBlogClient()
        # Post payload built by init_config(); None until then.
        self.post_data = None

    async def article_path_proc(self, file_name: str, md_content: str) -> dict:
        """Build the post payload dict that is later handed to ``new_post``.

        Args:
            file_name: Used as the post title.
            md_content: Markdown body, used as the post description.

        Returns:
            A dict with the keys ``dateCreated``, ``title``, ``description``,
            ``categories`` and ``mt_keywords``.
        """
        return {
            "dateCreated": datetime.datetime.now(),
            "title": file_name,
            "description": md_content,
            "categories": ["[Markdown]"],  # default to the Markdown editor
            "mt_keywords": CNBLOGS_MT_KEYWORDS,  # tags
        }

    async def init_config(self, file_name: str, md_content: str, image_results) -> None:
        """Upload images, rewrite their links and prepare ``self.post_data``.

        Args:
            file_name: Title of the article.
            md_content: Markdown source of the article.
            image_results: Iterable of dicts carrying ``image_url`` and
                ``image_content`` (may be empty or ``None``).
        """
        logger.info(f"[{self.type_crawler}] Start initializing and processing image links.")
        uploaded = await self.image_process(image_results)
        # Entries are None for images whose upload failed; skip those and
        # anything that does not carry both the old and the new URL.
        for mapping in uploaded or ():
            if mapping and "old_image_url" in mapping and "new_image_url" in mapping:
                md_content = md_content.replace(mapping["old_image_url"], mapping["new_image_url"])
        self.post_data = await self.article_path_proc(file_name, md_content)

    async def run(self) -> dict:
        """Publish the prepared post and report success or failure.

        Returns:
            ``{'result': AbstractCrawler.SUCCESS_RESULT}`` when ``new_post``
            succeeds, ``{'result': AbstractCrawler.FAILURE_RESULT}`` when all
            retry attempts of ``new_post`` failed.
        """
        logger.info(f'[{self.type_crawler}] Start publishing articles.')
        try:
            # new_post() returns the id of the published post; it is not
            # needed here, only success/failure is reported.
            await self.new_post()
        except RetryError as e:
            # last_attempt holds the outcome of the final retry attempt.
            logger.info(f'[{self.type_crawler}] Failure to publish the article! \nCause of error:{e.last_attempt.exception()}')
            return {'result': AbstractCrawler.FAILURE_RESULT}
        return {'result': AbstractCrawler.SUCCESS_RESULT}

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def new_post(self):
        """Publish ``self.post_data`` and return the new blog post id.

        The call is attempted up to three times, one second apart.

        Raises:
            RetryError: When every attempt raised. The original exception is
                available via ``last_attempt.exception()``.
        """
        # Let the client's exception propagate unchanged: tenacity records it
        # as-is, so its type and traceback are preserved for the caller.
        # NOTE(review): this client call is not awaited; if it performs
        # blocking network I/O it stalls the event loop - confirm and consider
        # asyncio.to_thread if the client is thread-safe.
        return self._cnBlogsMetaBlogClient.new_post(self.post_data, True)

    async def image_process(self, image_results):
        """Upload every image in ``image_results`` via ``new_media_object``.

        Args:
            image_results: Iterable of dicts carrying ``image_url`` and
                ``image_content``; falsy entries are skipped.

        Returns:
            A list with one ``new_media_object`` result per non-falsy entry,
            or ``None`` when ``image_results`` is empty or ``None``.
        """
        if not image_results:
            return None
        uploads = [
            self.new_media_object(item["image_url"], item["image_content"])
            for item in image_results
            if item
        ]
        return await asyncio.gather(*uploads)

    async def new_media_object(self, image_url, image_content):
        """Upload one image, trying up to three times.

        Args:
            image_url: The image's original URL as it appears in the article.
            image_content: The image payload sent as ``bits``.

        Returns:
            ``{"old_image_url": ..., "new_image_url": ...}`` on success, or
            ``None`` when all three attempts raised.
        """
        for _ in range(3):
            try:
                # NOTE(review): name and type are hard-coded regardless of the
                # actual image format - confirm the server tolerates this.
                response = self._cnBlogsMetaBlogClient.new_media_object({
                    "bits": image_content,
                    "name": "abc.png",
                    "type": "image/png"
                })
                return {"old_image_url": image_url, "new_image_url": response.get("url")}
            except Exception as e:
                # Best effort: log and retry; a failed image must not abort
                # the whole article.
                logger.error(f"[{self.type_crawler}] Failed to request image link!- {e}")
        return None

    async def login_as(self) -> None:
        """Flag under ``"cnblogs"`` in the crawler setup source whether the client has a config."""
        is_configured = self._cnBlogsMetaBlogClient.config is not None
        get_crawler_setup_source().update({"cnblogs": is_configured})

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Cyanty/Arcs-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.