Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# PDF Table Extraction: Docling vs LlamaParse

This folder contains the code examples for the Real Python tutorial [PDF Table Extraction: Docling vs LlamaParse](https://realpython.com/pdf-table-extraction-docling-vs-llamaparse/).

The scripts parse `sample_report.pdf`, a short financial report with tables, and compare two approaches:

- **[Docling](https://github.com/docling-project/docling)** runs locally and exports structured document data, including tables as pandas DataFrames.
- **[LlamaParse](https://docs.cloud.llamaindex.ai/llamaparse/getting_started)** uses the Llama Cloud API for parsing and schema-driven extraction.

## Files

| File | Description |
|------|-------------|
| `sample_report.pdf` | Sample PDF used by all scripts |
| `docling_extraction.py` | Parse the PDF with Docling and print Markdown output |
| `docling_tables.py` | Inspect detected tables and print selected DataFrames |
| `docling_formats.py` | Export Docling results to Markdown, JSON, HTML, and DataFrames |
| `llamaparse_extraction.py` | Parse the PDF with LlamaParse and print Markdown output |
| `llamaparse_formats.py` | Export LlamaParse results to Markdown, plain text, and JSON |
| `requirements.txt` | Dependencies for this folder (Docling is pinned to an exact version; the other packages specify minimum versions) |

## Installation

Create and activate a [virtual environment](https://realpython.com/python-virtual-environments-a-primer/), then install the dependencies:

```shell
$ python3 -m venv venv/
$ source venv/bin/activate
(venv) $ python -m pip install -r requirements.txt
```

Run the scripts from this folder so the relative path to `sample_report.pdf` resolves correctly.

## Docling examples

Docling runs on your machine and does not require an API key.

```shell
(venv) $ python docling_extraction.py
(venv) $ python docling_tables.py
(venv) $ python docling_formats.py
```

`docling_formats.py` writes `output_docling.md`, `output_docling.json`, and `output_docling.html` in the current directory.

## LlamaParse examples

The LlamaParse scripts require a [Llama Cloud API key](https://cloud.llamaindex.ai/). Export it before running:

```shell
(venv) $ export LLAMA_CLOUD_API_KEY="your-api-key"
(venv) $ python llamaparse_extraction.py
(venv) $ python llamaparse_formats.py
```

`llamaparse_formats.py` writes `output_llamaparse.md`, `output_llamaparse.text`, and `output_llamaparse.json` in the current directory.
22 changes: 22 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/docling_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Parse a PDF with Docling and print Markdown output."""

from pathlib import Path

from docling.document_converter import DocumentConverter

PDF_PATH = Path("sample_report.pdf")


def main() -> None:
    """Convert the sample PDF with Docling and print a Markdown preview.

    Prints the first 3000 characters of the Markdown export, a separator,
    and then the number of pages and tables on the parsed document.
    """
    conversion = DocumentConverter().convert(PDF_PATH)
    doc = conversion.document

    # Only a preview is printed; the full export can be much longer.
    preview = doc.export_to_markdown()[:3000]
    print(preview)
    print("\n---\n")
    print(f"Pages parsed: {len(doc.pages)}")
    print(f"Tables found: {len(doc.tables)}")


if __name__ == "__main__":
main()
34 changes: 34 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/docling_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Export Docling parse results to Markdown, JSON, HTML, and pandas DataFrames."""

import json
from pathlib import Path

from docling.document_converter import DocumentConverter

PDF_PATH = Path("sample_report.pdf")


def main() -> None:
    """Convert the sample PDF and save it as Markdown, JSON, and HTML.

    The three exports are written to ``output_docling.md``,
    ``output_docling.json``, and ``output_docling.html`` in the current
    directory. Afterwards every table on the document is exported to a
    DataFrame, and its shape and ``head()`` are printed.
    """
    doc = DocumentConverter().convert(PDF_PATH).document

    Path("output_docling.md").write_text(
        doc.export_to_markdown(),
        encoding="utf-8",
    )

    as_json = json.dumps(doc.export_to_dict(), indent=2)
    Path("output_docling.json").write_text(as_json, encoding="utf-8")

    Path("output_docling.html").write_text(
        doc.export_to_html(),
        encoding="utf-8",
    )

    for number, item in enumerate(doc.tables):
        df = item.export_to_dataframe(doc=doc)
        print(f"Table {number} shape: {df.shape}")
        print(df.head(), end="\n\n")


if __name__ == "__main__":
main()
30 changes: 30 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/docling_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Inspect and export tables from a Docling parse result."""

from pathlib import Path

from docling.document_converter import DocumentConverter

PDF_PATH = Path("sample_report.pdf")


def main() -> None:
    """Summarize every table Docling detects, then preview the first two.

    For each table, prints the sorted page numbers taken from its
    provenance entries and the shape of its DataFrame. Then prints table 0
    in full (without the index) and the first four rows of table 1.

    Each table is converted to a DataFrame only once and reused for the
    previews. If fewer than two tables are detected, the missing preview is
    skipped with a message instead of raising ``IndexError``.
    """
    document = DocumentConverter().convert(PDF_PATH).document
    tables = document.tables

    print(f"Tables found: {len(tables)}\n")

    # Keep the converted frames so tables 0 and 1 are not exported twice.
    frames = []
    for index, table in enumerate(tables):
        pages = sorted({prov.page_no for prov in table.prov})
        frame = table.export_to_dataframe(doc=document)
        frames.append(frame)
        print(f"Table {index}: pages {pages}, shape {frame.shape}")

    if not frames:
        print("\nNo tables detected; nothing to preview.")
        return

    print("\nFinancial statement index (table 0):")
    print(frames[0].to_string(index=False), end="\n\n")

    if len(frames) < 2:
        print("Only one table detected; skipping the table 1 preview.")
        return

    print("Operations statement preview (table 1, first 4 rows):")
    print(frames[1].head(4).to_string())


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Parse a PDF with LlamaParse (llama-cloud SDK) and print Markdown output."""

import os
from pathlib import Path

from llama_cloud import LlamaCloud

PDF_PATH = Path("sample_report.pdf")


def main() -> None:
    """Upload the sample PDF, parse it with LlamaParse, and print Markdown.

    The API key is read from the ``LLAMA_CLOUD_API_KEY`` environment
    variable; a missing variable raises ``KeyError``. The Markdown of every
    page is concatenated with a ``---`` separator after each page, and the
    first 3000 characters are printed, followed by the page count.
    """
    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])

    upload = client.files.create(file=PDF_PATH, purpose="parse")
    parsed = client.parsing.parse(
        file_id=upload.id,
        tier="agentic",
        version="latest",
        expand=["markdown"],
    )

    separator = "\n---\n"
    combined = "".join(
        page.markdown + separator for page in parsed.markdown.pages
    )

    print(combined[:3000])
    print(f"Pages parsed: {len(parsed.markdown.pages)}")


if __name__ == "__main__":
main()
71 changes: 71 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/llamaparse_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Export LlamaParse results to Markdown, Text, and schema-driven JSON."""

import json
import os
from pathlib import Path

from llama_cloud import LlamaCloud
from pydantic import BaseModel, Field

PDF_PATH = Path("sample_report.pdf")


class RevenueRow(BaseModel):
    # Schema for a single quarter's entry in the revenue table.
    # Documented with comments on purpose: pydantic copies a model's
    # docstring into the generated JSON schema, which is sent to the
    # extraction API, so a docstring here would change that payload.
    quarter: str = Field(description="Fiscal quarter label, e.g. Q1 2024")
    revenue_millions: float = Field(description="Revenue in millions of USD")
    # Optional field: defaults to None when no growth figure is provided.
    growth_percent: float | None = Field(
        description="Year-over-year growth percentage if stated",
        default=None,
    )


class RevenueTable(BaseModel):
    # Top-level extraction schema: a list of RevenueRow entries.
    # Documented with comments on purpose: pydantic copies a model's
    # docstring into the generated JSON schema, so a docstring here
    # would change the schema passed to the extraction request.
    rows: list[RevenueRow] = Field(description="One row per quarter in the table")


def main() -> None:
    """Parse the sample PDF with LlamaParse and save three output formats.

    Steps, in order:

    1. Upload the PDF for parsing and request Markdown and text output.
    2. Write the per-page Markdown, joined by blank lines, to
       ``output_llamaparse.md``.
    3. If text output is present, write the per-page text, joined by
       newlines, to ``output_llamaparse.text``.
    4. Upload the PDF again for extraction, run a schema-driven extraction
       using the ``RevenueTable`` JSON schema, then write the result to
       ``output_llamaparse.json`` and print it.

    The API key is read from the ``LLAMA_CLOUD_API_KEY`` environment
    variable; a missing variable raises ``KeyError``.
    """
    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])

    parse_upload = client.files.create(file=PDF_PATH, purpose="parse")
    parse_result = client.parsing.parse(
        file_id=parse_upload.id,
        tier="agentic",
        version="latest",
        expand=["markdown", "text"],
    )

    md_chunks = [page.markdown for page in parse_result.markdown.pages]
    Path("output_llamaparse.md").write_text(
        "\n\n".join(md_chunks),
        encoding="utf-8",
    )

    # The text file is written only when the text output is present.
    text_result = parse_result.text
    if text_result and text_result.pages:
        text_chunks = [page.text for page in text_result.pages]
        Path("output_llamaparse.text").write_text(
            "\n".join(text_chunks),
            encoding="utf-8",
        )

    extract_upload = client.files.create(file=PDF_PATH, purpose="extract")
    extraction_config = {
        "data_schema": RevenueTable.model_json_schema(),
        "extraction_target": "per_doc",
        "tier": "agentic",
    }
    job = client.extract.run(
        file_input=extract_upload.id,
        configuration=extraction_config,
    )

    # Render once; the same text goes to the file and to stdout.
    rendered = json.dumps(job.extract_result, indent=2)
    Path("output_llamaparse.json").write_text(rendered, encoding="utf-8")
    print(rendered)


if __name__ == "__main__":
main()
5 changes: 5 additions & 0 deletions pdf-table-extraction-docling-vs-llamaparse/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
docling==2.102.2
onnxruntime>=1.7.0,<2.0.0
llama-cloud>=2.9.0
pandas>=2.0.0
pydantic>=2.0.0
Binary file not shown.
Loading