# Scrapling Fetch MCP
# by cyberchitta
# Verified
# - scrapling-fetch-mcp
# - src
# - scrapling_fetch_mcp
# from https://github.com/microsoft/markitdown/blob/main/packages/markitdown/src/markitdown/converters/_markdownify.py
from re import search
from typing import Any, Optional
from urllib.parse import quote, unquote, urlparse, urlunparse
from markdownify import ATX, MarkdownConverter, chomp
class _CustomMarkdownify(MarkdownConverter):
    """Markdown converter with adjusted heading, link and image handling.

    Differences from the parent ``MarkdownConverter`` implemented here:

    * ``heading_style`` defaults to ``ATX`` unless the caller supplies one.
    * Block-level headings are guaranteed to start with a newline.
    * Links whose URL scheme is present but not ``http``, ``https`` or
      ``file`` are reduced to their text; the path component of accepted
      URLs is percent-encoded.
    * ``data:`` image sources are truncated to the part before the first
      comma, followed by ``...``.
    """

    def __init__(self, **options: Any):
        """Create the converter, defaulting ``heading_style`` to ``ATX``.

        Args:
            **options: Options forwarded to ``MarkdownConverter``.
        """
        options.setdefault("heading_style", ATX)
        super().__init__(**options)

    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Convert a heading, making sure block headings start on a new line.

        Args:
            n: Heading level, forwarded unchanged to the parent converter.
            el: The heading element, forwarded unchanged.
            text: Already-converted inner text of the heading.
            convert_as_inline: When truthy, the parent's result is returned
                without a leading newline being added.
            **kwargs: Accepted for signature compatibility; not used.

        Returns:
            The parent's heading markup, prefixed with ``"\\n"`` when the
            heading is not inline and ``text`` does not already start with
            a newline.
        """
        heading = super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
        if not convert_as_inline and not text.startswith("\n"):
            return "\n" + heading
        return heading

    def convert_a(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Convert an anchor, dropping unsupported schemes and quoting paths.

        Args:
            el: The anchor element; ``href`` and ``title`` are read from it.
            text: Already-converted inner text of the anchor.
            convert_as_inline: Accepted for signature compatibility; not used.
            **kwargs: Accepted for signature compatibility; not used.

        Returns:
            ``""`` when the link text is empty after ``chomp``; the bare text
            when the anchor sits inside a ``<pre>`` or has no ``href``; the
            text with its surrounding whitespace when the URL has a scheme
            other than http/https/file or cannot be parsed; otherwise a
            Markdown link (or ``<url>`` autolink when enabled and applicable).
        """
        prefix, suffix, text = chomp(text)  # type: ignore
        if not text:
            return ""
        if el.find_parent("pre") is not None:
            return text

        href = el.get("href")
        title = el.get("title")

        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                scheme = parsed_url.scheme
                # Only http(s) and file links are kept as links; anything
                # else with an explicit scheme is rendered as plain text.
                if scheme and scheme.lower() not in ("http", "https", "file"):  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                # unquote first so already-encoded paths are not
                # double-encoded by quote.
                href = urlunparse(
                    parsed_url._replace(path=quote(unquote(parsed_url.path)))
                )  # type: ignore
            except ValueError:
                return "%s%s%s" % (prefix, text, suffix)

        # Undo "\_" escaping in the text before comparing it with the URL.
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            return "<%s>" % href

        if not href:
            # No target: emit the text alone (prefix/suffix are not re-added).
            return text

        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)

    def convert_img(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Convert an image to Markdown, truncating ``data:`` URIs.

        Args:
            el: The image element; ``alt``, ``src`` and ``title`` are read
                from ``el.attrs`` and the parent tag name from ``el.parent``.
            text: Unused.
            convert_as_inline: When truthy and the parent tag is not listed in
                the ``keep_inline_images_in`` option, only the alt text is
                returned.
            **kwargs: Accepted for signature compatibility; not used.

        Returns:
            The alt text alone (inline case above), or ``![alt](src "title")``
            with the title part omitted when there is no title.
        """
        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""

        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Keep only the header of a data URI (everything before the first
        # comma) so the encoded payload is not copied into the output.
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        # The template must carry three placeholders; an empty template
        # raises TypeError when formatted with a three-element tuple.
        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        """Convert ``soup`` by delegating directly to the parent converter."""
        return super().convert_soup(soup)  # type: ignore