Overview

The Granite Docling model is designed for enterprise applications that involve end-to-end document understanding and conversion. Unlike traditional pipelines that chain together OCR, layout analysis, and post-processing, Granite Docling integrates vision and language into a single compact model (≈ 258M parameters) capable of parsing PDFs, slides, and scanned pages directly into structured, machine-readable formats. A key feature is its use of DocTags, a purpose-built markup language that separates content from layout while faithfully preserving tables, code blocks, inline and block math, and document hierarchy. This design improves fidelity, minimizes errors in reading order and structure, and makes outputs well-suited for downstream tasks such as retrieval-augmented generation (RAG) and fine-tuning. While optimized for Latin-script documents, it also offers early support for Japanese, Chinese, and Arabic. Granite Docling is released under the Apache 2.0 license, ensuring it is freely available for both research and commercial use. It complements the open-source Docling library by providing an all-in-one, production-ready alternative for document conversion, with full transparency into its design and training.
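As a rough illustration, a DocTags sequence wraps each page element in a tag and precedes its content with quantized location tokens, so layout and content travel together but remain separable. The fragment below is a hand-written sketch of the general shape, not actual model output; the exact tag vocabulary and location-token quantization are defined by the DocTags format shipped with docling-core:

<doctag>
<section_header_level_1><loc_58><loc_44><loc_426><loc_58>Granite Docling</section_header_level_1>
<text><loc_58><loc_70><loc_426><loc_110>An end-to-end model for document conversion ...</text>
<otsl><loc_58><loc_130><loc_426><loc_210> ... table cells in OTSL notation ... </otsl>
</doctag>

Downstream code converts such sequences into a DoclingDocument and exports Markdown, HTML, or other formats, as shown in the examples below.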

Getting Started

The easiest way to use this model is through the 🐥Docling library, which will automatically download the model and convert documents to various formats for you. Install the latest version of Docling through pip (`pip install docling`), then use the following CLI commands:
# Convert to HTML and Markdown:
docling --to html --to md --pipeline vlm --vlm-model granite_docling "https://arxiv.org/pdf/2501.17887" # accepts files, urls or directories

# Convert to HTML including layout visualization:
docling --to html_split_page --show-layout --pipeline vlm --vlm-model granite_docling "https://arxiv.org/pdf/2501.17887"
You can also set this model up within the Docling SDK:
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

source = "https://arxiv.org/pdf/2501.17887"

###### USING SIMPLE DEFAULT VALUES
# - GraniteDocling model
# - Using the transformers framework

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
        ),
    }
)

doc = converter.convert(source=source).document

print(doc.export_to_markdown())


###### USING MACOS MPS ACCELERATOR
# For more options see the compare_vlm_models.py example.

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_MLX,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)

doc = converter.convert(source=source).document

print(doc.export_to_markdown())
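If you prefer to pin the transformers-backed spec explicitly rather than relying on the defaults (for example, to switch back from MLX), you can pass it the same way. This is a minimal sketch that assumes a GRANITEDOCLING_TRANSFORMERS entry in vlm_model_specs, mirroring the GRANITEDOCLING_MLX spec used above:

###### EXPLICITLY SELECTING THE TRANSFORMERS SPEC (sketch)
# Assumes vlm_model_specs.GRANITEDOCLING_TRANSFORMERS exists alongside GRANITEDOCLING_MLX.

pipeline_options = VlmPipelineOptions(
    vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)

doc = converter.convert(source=source).document
print(doc.export_to_markdown())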
Alternatively, you can use bare transformers, vLLM, ONNX, or mlx-vlm to perform inference, and the docling-core APIs to convert the results into a variety of output formats (Markdown, HTML, etc.). Here's an example of single-page image inference using plain transformers.
# Prerequisites:
# pip install torch
# pip install docling_core
# pip install transformers

import torch
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image
from pathlib import Path

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load images
image = load_image("https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png")

# Initialize processor and model
processor = AutoProcessor.from_pretrained("ibm-granite/granite-docling-258M")
model = AutoModelForVision2Seq.from_pretrained(
    "ibm-granite/granite-docling-258M",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "sdpa",
).to(DEVICE)

# Create input messages
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Convert this page to docling."}
        ]
    },
]

# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(DEVICE)

# Generate outputs
generated_ids = model.generate(**inputs, max_new_tokens=8192)
prompt_length = inputs.input_ids.shape[1]
trimmed_generated_ids = generated_ids[:, prompt_length:]
doctags = processor.batch_decode(
    trimmed_generated_ids,
    skip_special_tokens=False,
)[0].lstrip()

print(f"DocTags: \n{doctags}\n")


# Populate document
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
# create a docling document
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
print(f"Markdown:\n{doc.export_to_markdown()}\n")

## Export to other formats (uncomment to use):
# Path("out/").mkdir(parents=True, exist_ok=True)
# HTML:
# output_path_html = Path("out/") / "example.html"
# doc.save_as_html(output_path_html)
# Markdown:
# output_path_md = Path("out/") / "example.md"
# doc.save_as_markdown(output_path_md)
Here's an example of fast batch inference using vLLM:
# Prerequisites:
# pip install vllm
# pip install docling_core
# place page images you want to convert into "img/" dir

import time
import os
from vllm import LLM, SamplingParams
from transformers import AutoProcessor
from PIL import Image
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from pathlib import Path

# Configuration
MODEL_PATH = "ibm-granite/granite-docling-258M"
IMAGE_DIR = "img/"  # Place your page images here
OUTPUT_DIR = "out/"
PROMPT_TEXT = "Convert this page to docling."

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": PROMPT_TEXT},
        ],
    },
]


# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize LLM
llm = LLM(model=MODEL_PATH, revision="untied", limit_mm_per_prompt={"image": 1})
processor = AutoProcessor.from_pretrained(MODEL_PATH)

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=8192,
    skip_special_tokens=False,
)

# Load and prepare all images and prompts up front
batched_inputs = []
image_names = []

for img_file in sorted(os.listdir(IMAGE_DIR)):
    if img_file.lower().endswith((".png", ".jpg", ".jpeg")):
        img_path = os.path.join(IMAGE_DIR, img_file)
        with Image.open(img_path) as im:
            image = im.convert("RGB")

        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        batched_inputs.append({"prompt": prompt, "multi_modal_data": {"image": image}})
        image_names.append(os.path.splitext(img_file)[0])

# Run batch inference
start_time = time.time()
outputs = llm.generate(batched_inputs, sampling_params=sampling_params)

# Postprocess all results
for img_fn, output, input_data in zip(image_names, outputs, batched_inputs):
    doctags = output.outputs[0].text
    output_path_dt = Path(OUTPUT_DIR) / f"{img_fn}.dt"
    output_path_md = Path(OUTPUT_DIR) / f"{img_fn}.md"

    with open(output_path_dt, "w", encoding="utf-8") as f:
        f.write(doctags)

    # Convert to DoclingDocument and save markdown
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [input_data["multi_modal_data"]["image"]])
    doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
    doc.save_as_markdown(output_path_md)

print(f"Total time: {time.time() - start_time:.2f} sec")