Scrapling Fetch MCP

by cyberchitta
Verified
# from https://github.com/microsoft/markitdown/blob/main/packages/markitdown/src/markitdown/converters/_markdownify.py
from re import search
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse

from markdownify import ATX, MarkdownConverter, chomp


class _CustomMarkdownify(MarkdownConverter):
    """MarkdownConverter subclass that adjusts heading, link and image output.

    Differences from the parent converter that are visible in this class:

    - ``heading_style`` defaults to ``ATX`` unless the caller supplies one.
    - Non-inline headings get a leading newline when their text does not
      already start with one.
    - Links whose scheme is set and is not ``http``, ``https`` or ``file``
      are rendered as plain text; the path of every other link is
      re-encoded with ``quote(unquote(...))``.
    - Image sources starting with ``data:`` are cut at the first comma and
      suffixed with ``...``.
    """

    def __init__(self, **options: Any):
        """Build the converter, defaulting ``heading_style`` to ``ATX``."""
        # An explicitly supplied heading_style (whatever its value) is kept.
        options.setdefault("heading_style", ATX)
        super().__init__(**options)

    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Render a level-``n`` heading via the parent converter.

        When not converting inline and ``text`` does not start with a
        newline, the parent's output is prefixed with ``"\\n"``. Extra
        keyword arguments are accepted but not forwarded to the parent.
        """
        # Without re.MULTILINE, "^" anchors only at the start of the string.
        needs_newline = not convert_as_inline and not search(r"^\n", text)
        rendered = super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
        if needs_newline:
            return "\n" + rendered
        return rendered

    def convert_a(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ):
        """Render an anchor element as Markdown.

        Returns ``""`` when the chomped text is empty, and the bare text when
        the element has a ``<pre>`` ancestor or no ``href``. Links with a
        scheme other than http/https/file (or whose URL raises ``ValueError``
        while being parsed/rebuilt) are rendered as their text only, with the
        chomped prefix and suffix restored.
        """
        # chomp() is markdownify's helper; presumably it splits surrounding
        # whitespace off into prefix/suffix -- confirm against markdownify.
        prefix, suffix, text = chomp(text)  # type: ignore
        if not text:
            return ""
        if el.find_parent("pre") is not None:
            return text

        href = el.get("href")
        title = el.get("title")
        # Fallback used whenever the link target is dropped.
        text_only = "%s%s%s" % (prefix, text, suffix)

        if href:
            try:
                parts = urlparse(href)  # type: ignore
                scheme = parts.scheme
                if scheme and scheme.lower() not in ("http", "https", "file"):
                    return text_only
                # unquote-then-quote normalises the path without
                # double-encoding escapes that are already present.
                encoded_path = quote(unquote(parts.path))
                href = urlunparse(parts._replace(path=encoded_path))  # type: ignore
            except ValueError:
                return text_only

        options = self.options
        # Escaped underscores in the text are undone before comparing it
        # with the href for the <url> autolink shortcut.
        if (
            options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not options["default_title"]
        ):
            return "<%s>" % href

        if options["default_title"] and not title:
            title = href

        title_part = ""
        if title:
            title_part = ' "%s"' % title.replace('"', r"\"")

        if not href:
            return text
        return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)

    def convert_img(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Render an image element as Markdown.

        When converting inline and the parent tag is not listed in the
        ``keep_inline_images_in`` option, only the alt text is returned.
        A ``data:`` source is truncated at its first comma and followed by
        ``...``.
        """
        attributes = el.attrs
        alt = attributes.get("alt", None) or ""
        src = attributes.get("src", None) or ""
        title = attributes.get("title", None) or ""

        title_part = ""
        if title:
            title_part = ' "%s"' % title.replace('"', r"\"")

        if convert_as_inline:
            if el.parent.name not in self.options["keep_inline_images_in"]:
                return alt

        if src.startswith("data:"):
            # Keep only what precedes the first comma of the data URI.
            src = src.partition(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        """Convert ``soup`` by delegating unchanged to the parent converter."""
        markdown = super().convert_soup(soup)  # type: ignore
        return markdown