# Florence-2 MCP Server
# by jkawamoto
# Verified
# - mcp-florence2
# - src
# - mcp_florence2
# server.py
#
# Copyright (c) 2025 Junpei Kawamoto
#
# This software is released under the MIT License.
#
# http://opensource.org/licenses/mit-license.php
import os
from contextlib import asynccontextmanager, contextmanager, closing, ExitStack
from dataclasses import dataclass
from functools import partial
from io import BytesIO
from os import PathLike
from typing import Protocol, AsyncIterator, Iterator
import requests
from PIL.Image import Image, open as open_image
from mcp.server import FastMCP
from mcp.server.fastmcp import Context
from pydantic import Field
from pypdfium2 import PdfDocument
from mcp_florence2.florence2 import Florence2, Florence2SP, CaptionLevel
# Seconds to wait on the remote host before a download is abandoned; without a
# timeout a stalled server would block the calling tool indefinitely.
_REQUEST_TIMEOUT = 30


@contextmanager
def _open_pdf_images(source: PathLike | str | bytes) -> Iterator[list[Image]]:
    """Renders every page of a PDF and yields the pages as a list of images."""
    with ExitStack() as stack:
        # The document is only needed while rendering, so it is closed before
        # yielding; the rendered images are registered on the stack and are
        # closed once the caller leaves the context.
        with closing(PdfDocument(source)) as doc:
            images = [stack.enter_context(page.render().to_pil()) for page in doc]
        yield images


@contextmanager
def get_images(src: PathLike | str) -> Iterator[list[Image]]:
    """Opens and returns a list of images from a file path or URL.

    A PDF source yields one image per page; any other source is opened as a
    single image and yielded as a one-element list. The images are closed
    when the context exits.

    ``src`` is treated as a URL when it is a string starting with ``http://``
    or ``https://``; the response's ``Content-Type`` header decides whether it
    is a PDF. Anything else is treated as a local path, where a ``.pdf``
    extension (case-insensitive) selects the PDF branch.

    Raises ``requests.RequestException`` when the download fails, times out,
    or the server answers with an error status.
    """
    if isinstance(src, str) and src.startswith(("http://", "https://")):
        res = requests.get(src, timeout=_REQUEST_TIMEOUT)
        res.raise_for_status()
        # Compare only the media type: the header may be missing, may carry
        # parameters ("application/pdf; charset=binary"), or use other casing.
        content_type = res.headers.get("Content-Type", "")
        media_type = content_type.split(";", 1)[0].strip().lower()
        if media_type == "application/pdf":
            with _open_pdf_images(res.content) as images:
                yield images
        else:
            with open_image(BytesIO(res.content)) as image:
                yield [image]
    else:
        ext = os.path.splitext(src)[1].lower()
        if ext == ".pdf":
            with _open_pdf_images(src) as images:
                yield images
        else:
            with open_image(src) as image:
                yield [image]
class Processor(Protocol):
    """Structural interface for the image processors used by the server.

    Any object offering the two methods below can serve as a processor: one
    extracts text from images (OCR), the other describes their content with
    captions. Concrete processors only need to match these signatures.
    """

    def ocr(self, images: list[Image]) -> list[str]:
        """Extracts the text found in each of the given images.

        The result holds one string per input image, in the same order as
        the input list.
        """
        ...

    def caption(self, images: list[Image], level: CaptionLevel = CaptionLevel.NORMAL) -> list[str]:
        """Describes each of the given images with a textual caption.

        ``level`` selects how verbose or fine-grained the captions should be
        and defaults to ``CaptionLevel.NORMAL``. The captions are returned as
        a list of strings.
        """
        ...
@dataclass
class AppContext:
    """State shared with tool handlers through the FastMCP lifespan context."""

    # The processor that tool handlers call for OCR and captioning.
    processor: Processor
@asynccontextmanager
async def app_lifespan(_server: FastMCP, model_id: str, subprocess: bool) -> AsyncIterator[AppContext]:
    """Creates the processor and yields it wrapped in an ``AppContext``.

    When ``subprocess`` is true a ``Florence2SP`` is built, otherwise a
    ``Florence2``; either one is constructed from ``model_id``. The
    ``_server`` argument is not used.
    """
    backend = Florence2SP if subprocess else Florence2
    processor: Processor = backend(model_id)
    yield AppContext(processor)
def new_server(name: str, model_id: str, subprocess: bool = True) -> FastMCP:
    """Builds a FastMCP server named ``name`` that exposes ``ocr`` and ``caption`` tools.

    ``model_id`` and ``subprocess`` are bound into ``app_lifespan``, which
    supplies the processor both tools use. The configured server is returned.
    """
    src_description = "A file path or URL to the image file that needs to be processed."
    lifespan = partial(app_lifespan, model_id=model_id, subprocess=subprocess)
    mcp = FastMCP(name, lifespan=lifespan)

    @mcp.tool()
    def ocr(
        ctx: Context,
        src: PathLike | str = Field(description=src_description),
    ) -> list[str]:
        """Process an image file or URL using OCR to extract text."""
        with get_images(src) as images:
            shared: AppContext = ctx.request_context.lifespan_context
            processor = shared.processor
            return processor.ocr(images)

    @mcp.tool()
    def caption(
        ctx: Context,
        src: PathLike | str = Field(description=src_description),
    ) -> list[str]:
        """Processes an image file and generates captions for the image."""
        with get_images(src) as images:
            shared: AppContext = ctx.request_context.lifespan_context
            processor = shared.processor
            # Captions are always requested at the MORE_DETAILED level.
            return processor.caption(images, CaptionLevel.MORE_DETAILED)

    return mcp