import sys
import json
import os
# These variables are exported before the heavy third-party imports below so
# that the libraries see them at import time (intended to suppress log noise).
os.environ.update(
    {
        "GRPC_VERBOSITY": "ERROR",
        "GLOG_minloglevel": "2",
        "PYTORCH_ENABLE_MPS_FALLBACK": "1",
    }
)

# Keep every cache under "<current working directory>/.cache" to avoid
# permission issues with the default cache locations.
cache_dir = os.path.join(os.getcwd(), ".cache")
os.environ.update(
    {
        "HF_HOME": cache_dir,
        "TORCH_HOME": cache_dir,
        "MODEL_CACHE_DIR": os.path.join(cache_dir, "surya"),  # Surya cache
    }
)
try:
    from marker.config.parser import ConfigParser
    from marker.models import create_model_dict
    from marker.output import text_from_rendered
except ImportError as exc:
    # Fail gracefully if marker (or one of its dependencies) cannot be imported.
    # The JSON payload on stdout is kept unchanged; the underlying cause goes
    # to stderr so that a missing package can be told apart from a broken
    # transitive import.
    print(f"marker import failed: {exc!r}", file=sys.stderr)
    print(json.dumps({"error": "marker-pdf not installed or imports failed"}))
    sys.exit(1)
def _parse_page_range(args):
    """Translate optional 1-based ``<start> <end>`` arguments into a page range.

    Args:
        args: The command-line arguments that follow the PDF path.

    Returns:
        A 0-based, inclusive ``"start-end"`` string, or ``None`` when fewer
        than two arguments were given or when they are not integers (the
        conversion then runs without a page range).

    Raises:
        ValueError: If both values are integers but do not form a valid
            1-based range (a page below 1, or an end before the start).
    """
    if len(args) < 2:
        return None

    try:
        start_page = int(args[0])
        end_page = int(args[1])
    except ValueError:
        # Best-effort, as before: non-integer values do not abort the run,
        # but the fallback is now reported instead of being silent.
        print(
            f"Ignoring non-integer page range: {args[0]!r} {args[1]!r}",
            file=sys.stderr,
        )
        return None

    # Without this check a start of 0 would be formatted as "-1-<end>",
    # which is not a well-formed "start-end" string.
    if start_page < 1 or end_page < start_page:
        raise ValueError(
            f"Invalid page range: start={start_page}, end={end_page} "
            "(pages are 1-based and end must be >= start)"
        )

    # Convert 1-based to 0-based, formatted as "start-end".
    return f"{start_page - 1}-{end_page - 1}"


def main() -> None:
    """Convert a PDF to markdown with marker and print the result as JSON.

    Command line: ``python convert.py <pdf_path> [<start_page> <end_page>]``
    where the optional page numbers are 1-based and inclusive.

    On success a JSON object with the keys ``"text"`` and ``"metadata"`` is
    printed to stdout. On any failure a JSON object with the key ``"error"``
    is printed to stdout and the process exits with status 1.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: python convert.py <pdf_path>"}))
        sys.exit(1)

    pdf_path = sys.argv[1]
    kwargs = {"output_format": "markdown"}

    try:
        page_range = _parse_page_range(sys.argv[2:])
    except ValueError as exc:
        print(json.dumps({"error": str(exc)}))
        sys.exit(1)
    if page_range is not None:
        kwargs["page_range"] = page_range

    try:
        # Initialize configuration
        config_parser = ConfigParser(kwargs)

        # Create model dictionary
        models = create_model_dict()

        # Get converter class and instance
        converter_cls = config_parser.get_converter_cls()
        converter = converter_cls(
            config=config_parser.generate_config_dict(),
            artifact_dict=models,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )

        # Run conversion
        rendered = converter(pdf_path)

        # Only the text is needed; the extension and images are discarded.
        text, _, _ = text_from_rendered(rendered)

        result = {
            "text": text,
            "metadata": getattr(rendered, "metadata", {}),
        }
        print(json.dumps(result))
    except Exception as e:
        # Top-level boundary: every failure is reported as JSON on stdout so
        # that the output stays machine-readable.
        print(json.dumps({"error": str(e)}))
        sys.exit(1)


if __name__ == "__main__":
    main()