mirror of
https://github.com/HKUDS/RAG-Anything.git
synced 2025-08-20 19:01:34 +03:00
535 lines
15 KiB
Python
535 lines
15 KiB
Python
"""
|
|
Enhanced Markdown to PDF Conversion
|
|
|
|
This module provides improved Markdown to PDF conversion with:
|
|
- Better formatting and styling
|
|
- Image support
|
|
- Table support
|
|
- Code syntax highlighting
|
|
- Custom templates
|
|
- Multiple output formats
|
|
"""
|
|
|
|
import os
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
from dataclasses import dataclass
|
|
import tempfile
|
|
import subprocess
|
|
|
|
# Optional dependency probes. Each flag records whether a backend can be used
# so the converter can degrade gracefully instead of failing at import time.

# Python-Markdown: renders Markdown to HTML for the WeasyPrint path.
try:
    import markdown
except ImportError:
    MARKDOWN_AVAILABLE = False
else:
    MARKDOWN_AVAILABLE = True

# WeasyPrint: renders HTML to PDF. Besides ImportError, importing it can raise
# OSError when its native libraries cannot be loaded; treat that as
# "backend unavailable" rather than letting the whole module fail to import.
try:
    from weasyprint import HTML
except (ImportError, OSError):
    WEASYPRINT_AVAILABLE = False
else:
    WEASYPRINT_AVAILABLE = True

# Check if pandoc module exists (not used directly, just for detection)
try:
    import importlib.util

    spec = importlib.util.find_spec("pandoc")
except (ImportError, ValueError):
    # find_spec raises ValueError when an already-imported module has no spec.
    PANDOC_AVAILABLE = False
else:
    PANDOC_AVAILABLE = spec is not None
|
|
|
|
|
|
@dataclass
class MarkdownConfig:
    """Configuration for Markdown to PDF conversion

    All fields are optional; the defaults below describe an A4 document with
    one-inch margins. Field order is part of the positional constructor
    signature and must not be changed.
    """

    # --- Styling options -------------------------------------------------
    # Path to a user stylesheet (the CLI stores its --css value here).
    css_file: Optional[str] = None
    # Path to a template file. NOTE(review): no visible code reads this yet.
    template_file: Optional[str] = None
    page_size: str = "A4"
    margin: str = "1in"
    font_size: str = "12pt"
    line_height: str = "1.5"

    # --- Content options -------------------------------------------------
    include_toc: bool = True
    syntax_highlighting: bool = True
    image_max_width: str = "100%"
    table_style: str = "border-collapse: collapse; width: 100%;"

    # --- Output options --------------------------------------------------
    output_format: str = "pdf"  # pdf, html, docx
    output_dir: Optional[str] = None

    # --- Advanced options ------------------------------------------------
    # Inline CSS text used for the generated HTML document when provided.
    custom_css: Optional[str] = None
    metadata: Optional[Dict[str, str]] = None
|
|
|
|
|
|
class EnhancedMarkdownConverter:
    """
    Enhanced Markdown to PDF converter with multiple backends

    Supports multiple conversion methods:
    - WeasyPrint (recommended for HTML/CSS styling): Markdown is rendered to
      HTML with the ``markdown`` package and the HTML is written as PDF.
    - Pandoc (recommended for complex documents): the ``pandoc`` executable
      is run with the ``wkhtmltopdf`` PDF engine.

    No ReportLab backend is implemented in this class.
    """

    def __init__(self, config: Optional["MarkdownConfig"] = None):
        """
        Initialize the converter

        Args:
            config: Configuration for conversion; a default ``MarkdownConfig``
                is created when omitted.
        """
        self.config = config or MarkdownConfig()
        self.logger = logging.getLogger(__name__)

        # Check available backends
        self.available_backends = self._check_backends()
        # Report only the backends that were actually detected; the mapping
        # always contains every key, so listing the keys would be misleading.
        usable = [name for name, ok in self.available_backends.items() if ok]
        self.logger.info(f"Available backends: {usable}")

    def _check_backends(self) -> Dict[str, bool]:
        """Check which conversion backends are available

        Returns:
            Mapping of backend name to availability. ``pandoc_system`` reflects
            whether running ``pandoc --version`` succeeded.
        """
        backends = {
            "weasyprint": WEASYPRINT_AVAILABLE,
            "pandoc": PANDOC_AVAILABLE,
            "markdown": MARKDOWN_AVAILABLE,
        }

        # Check if pandoc is installed on system. OSError covers a missing or
        # non-executable binary; SubprocessError covers a non-zero exit status
        # and the timeout that keeps a hung executable from blocking startup.
        try:
            subprocess.run(
                ["pandoc", "--version"],
                capture_output=True,
                check=True,
                timeout=15,
            )
            backends["pandoc_system"] = True
        except (OSError, subprocess.SubprocessError):
            backends["pandoc_system"] = False

        return backends

    def _get_default_css(self) -> str:
        """Get default CSS styling"""
        return """
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }

        h1, h2, h3, h4, h5, h6 {
            color: #2c3e50;
            margin-top: 1.5em;
            margin-bottom: 0.5em;
        }

        h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
        h2 { font-size: 1.5em; border-bottom: 1px solid #bdc3c7; padding-bottom: 0.2em; }
        h3 { font-size: 1.3em; }
        h4 { font-size: 1.1em; }

        p { margin-bottom: 1em; }

        code {
            background-color: #f8f9fa;
            padding: 2px 4px;
            border-radius: 3px;
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
        }

        pre {
            background-color: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            border-left: 4px solid #3498db;
        }

        pre code {
            background-color: transparent;
            padding: 0;
        }

        blockquote {
            border-left: 4px solid #3498db;
            margin: 0;
            padding-left: 20px;
            color: #7f8c8d;
        }

        table {
            border-collapse: collapse;
            width: 100%;
            margin: 1em 0;
        }

        th, td {
            border: 1px solid #ddd;
            padding: 8px 12px;
            text-align: left;
        }

        th {
            background-color: #f2f2f2;
            font-weight: bold;
        }

        img {
            max-width: 100%;
            height: auto;
            display: block;
            margin: 1em auto;
        }

        ul, ol {
            margin-bottom: 1em;
        }

        li {
            margin-bottom: 0.5em;
        }

        a {
            color: #3498db;
            text-decoration: none;
        }

        a:hover {
            text-decoration: underline;
        }

        .toc {
            background-color: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 2em;
        }

        .toc ul {
            list-style-type: none;
            padding-left: 0;
        }

        .toc li {
            margin-bottom: 0.3em;
        }

        .toc a {
            color: #2c3e50;
        }
        """

    def _resolve_css(self) -> str:
        """Select the stylesheet text for the generated HTML document

        Precedence: ``config.custom_css`` (inline text), then the contents of
        ``config.css_file``, then the built-in default. An unreadable CSS file
        is logged as a warning and the default styles are used instead.
        """
        if self.config.custom_css:
            return self.config.custom_css

        css_file = self.config.css_file
        if css_file:
            try:
                return Path(css_file).read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                self.logger.warning(
                    f"Could not read CSS file {css_file}: {e}; using default styles"
                )

        return self._get_default_css()

    def _process_markdown_content(self, content: str) -> str:
        """Process Markdown content with extensions

        Args:
            content: Markdown source text.

        Returns:
            A complete HTML document (doctype, head with inline CSS, body).

        Raises:
            RuntimeError: If the ``markdown`` package is not installed.
        """
        if not MARKDOWN_AVAILABLE:
            raise RuntimeError(
                "Markdown library not available. Install with: pip install markdown"
            )

        # Configure Markdown extensions
        extensions = [
            "markdown.extensions.tables",
            "markdown.extensions.fenced_code",
            "markdown.extensions.codehilite",
            "markdown.extensions.toc",
            "markdown.extensions.attr_list",
            "markdown.extensions.def_list",
            "markdown.extensions.footnotes",
        ]

        # NOTE(review): Python-Markdown appears to look these settings up by
        # the exact string used in `extensions`; the short keys below may not
        # match the dotted names above, in which case they are not applied.
        # Confirm before relying on them (enabling `permalink` would add a
        # permalink mark to every heading in the output).
        extension_configs = {
            "codehilite": {
                "css_class": "highlight",
                "use_pygments": True,
            },
            "toc": {
                "title": "Table of Contents",
                "permalink": True,
            },
        }

        # Convert Markdown to HTML
        md = markdown.Markdown(
            extensions=extensions, extension_configs=extension_configs
        )
        html_content = md.convert(content)

        # Add CSS styling (honours custom_css, then css_file, then default)
        css = self._resolve_css()

        # Create complete HTML document
        html_doc = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="UTF-8">
            <title>Converted Document</title>
            <style>
                {css}
            </style>
        </head>
        <body>
            {html_content}
        </body>
        </html>
        """

        return html_doc

    def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
        """Convert using WeasyPrint (best for styling)

        Args:
            markdown_content: Markdown source text.
            output_path: Destination PDF path.

        Returns:
            True on success; False if rendering failed (the error is logged).

        Raises:
            RuntimeError: If WeasyPrint is not available.
        """
        if not WEASYPRINT_AVAILABLE:
            raise RuntimeError(
                "WeasyPrint not available. Install with: pip install weasyprint"
            )

        try:
            # Process Markdown to HTML
            html_content = self._process_markdown_content(markdown_content)

            # Convert HTML to PDF
            html = HTML(string=html_content)
            html.write_pdf(output_path)

            self.logger.info(
                f"Successfully converted to PDF using WeasyPrint: {output_path}"
            )
            return True

        except Exception as e:
            self.logger.error(f"WeasyPrint conversion failed: {str(e)}")
            return False

    def convert_with_pandoc(
        self, markdown_content: str, output_path: str, use_system_pandoc: bool = False
    ) -> bool:
        """Convert using Pandoc (best for complex documents)

        The Markdown is written to a temporary ``.md`` file which is passed to
        the ``pandoc`` executable; the temporary file is removed afterwards.

        Args:
            markdown_content: Markdown source text.
            output_path: Destination PDF path.
            use_system_pandoc: When True, skip the availability check and
                attempt to run ``pandoc`` regardless of the startup probe.

        Returns:
            True if pandoc exited with status 0; False otherwise (the error is
            logged).

        Raises:
            RuntimeError: If pandoc was not detected and ``use_system_pandoc``
                is False.
        """
        if (
            not self.available_backends.get("pandoc_system", False)
            and not use_system_pandoc
        ):
            raise RuntimeError(
                "Pandoc not available. Install from: https://pandoc.org/installing.html"
            )

        temp_md_path = None
        try:
            # Create temporary markdown file. Write it as UTF-8 explicitly:
            # pandoc reads its input as UTF-8, while the platform default
            # encoding may differ and corrupt or reject non-ASCII text.
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".md", delete=False, encoding="utf-8"
            ) as temp_file:
                # Record the path first so cleanup still runs if the write fails.
                temp_md_path = temp_file.name
                temp_file.write(markdown_content)

            # Build pandoc command with wkhtmltopdf engine
            cmd = [
                "pandoc",
                temp_md_path,
                "-o",
                output_path,
                "--pdf-engine=wkhtmltopdf",
                "--standalone",
                "--toc",
                "--number-sections",
            ]

            # Run pandoc
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)

            if result.returncode == 0:
                self.logger.info(
                    f"Successfully converted to PDF using Pandoc: {output_path}"
                )
                return True
            else:
                self.logger.error(f"Pandoc conversion failed: {result.stderr}")
                return False

        except Exception as e:
            self.logger.error(f"Pandoc conversion failed: {str(e)}")
            return False

        finally:
            if temp_md_path and os.path.exists(temp_md_path):
                try:
                    os.unlink(temp_md_path)
                except OSError as e:
                    self.logger.error(
                        f"Failed to clean up temp file {temp_md_path}: {str(e)}"
                    )

    def convert_markdown_to_pdf(
        self, markdown_content: str, output_path: str, method: str = "auto"
    ) -> bool:
        """
        Convert markdown content to PDF

        Args:
            markdown_content: Markdown content to convert
            output_path: Output PDF file path
            method: Conversion method ("auto", "weasyprint", "pandoc", "pandoc_system")

        Returns:
            True if conversion successful, False otherwise. Backend errors and
            unknown methods are logged and reported as False, not raised.
        """
        if method == "auto":
            method = self._get_recommended_backend()

        # "none" is what backend selection yields when nothing is installed;
        # report that directly instead of as an unknown-method error.
        if method == "none":
            self.logger.error(
                "No PDF conversion backend available. "
                "Install pandoc or run: pip install weasyprint"
            )
            return False

        try:
            if method == "weasyprint":
                return self.convert_with_weasyprint(markdown_content, output_path)
            elif method == "pandoc":
                return self.convert_with_pandoc(markdown_content, output_path)
            elif method == "pandoc_system":
                return self.convert_with_pandoc(
                    markdown_content, output_path, use_system_pandoc=True
                )
            else:
                raise ValueError(f"Unknown conversion method: {method}")

        except Exception as e:
            self.logger.error(f"{method.title()} conversion failed: {str(e)}")
            return False

    def _read_markdown_file(self, input_path) -> str:
        """Read a Markdown file, trying several text encodings in turn

        UTF-8 is tried first, then GBK, then cp1252, then latin-1. cp1252 is
        tried before latin-1 because latin-1 accepts every byte sequence, so
        anything listed after it could never be reached.

        Raises:
            RuntimeError: If no encoding can decode the file (defensive; the
                latin-1 fallback is expected to always succeed).
        """
        for encoding in ("utf-8", "gbk", "cp1252", "latin-1"):
            try:
                with open(input_path, "r", encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue

        raise RuntimeError(
            f"Could not decode file {input_path} with any supported encoding"
        )

    def convert_file_to_pdf(
        self, input_path: str, output_path: Optional[str] = None, method: str = "auto"
    ) -> bool:
        """
        Convert Markdown file to PDF

        Args:
            input_path: Input Markdown file path
            output_path: Output PDF file path (optional); defaults to the
                input path with its suffix replaced by ``.pdf``
            method: Conversion method

        Returns:
            bool: True if conversion successful

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            RuntimeError: If the file cannot be decoded.
        """
        input_path_obj = Path(input_path)

        if not input_path_obj.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Read markdown content
        markdown_content = self._read_markdown_file(input_path)

        # Determine output path
        if output_path is None:
            output_path = str(input_path_obj.with_suffix(".pdf"))

        return self.convert_markdown_to_pdf(markdown_content, output_path, method)

    def get_backend_info(self) -> Dict[str, Any]:
        """Get information about available backends

        Returns:
            Dict with the backend availability mapping, the recommended
            backend name, and a subset of the active configuration.
        """
        return {
            "available_backends": self.available_backends,
            "recommended_backend": self._get_recommended_backend(),
            "config": {
                "page_size": self.config.page_size,
                "margin": self.config.margin,
                "font_size": self.config.font_size,
                "include_toc": self.config.include_toc,
                "syntax_highlighting": self.config.syntax_highlighting,
            },
        }

    def _get_recommended_backend(self) -> str:
        """Get recommended backend based on availability

        Returns:
            "pandoc" when the pandoc executable was detected, otherwise
            "weasyprint" when that package is available, otherwise "none".
        """
        if self.available_backends.get("pandoc_system", False):
            return "pandoc"
        elif self.available_backends.get("weasyprint", False):
            return "weasyprint"
        else:
            return "none"
|
|
|
|
|
|
def main(argv=None):
    """Command-line interface for enhanced markdown conversion

    Args:
        argv: Optional list of command-line argument strings. When None,
            argparse reads the process arguments, so existing ``main()``
            callers are unaffected.

    Returns:
        int: 0 after a successful conversion or ``--info`` listing, 1 when the
        conversion fails or raises. Usage errors terminate through argparse.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Enhanced Markdown to PDF conversion")
    parser.add_argument("input", nargs="?", help="Input markdown file")
    parser.add_argument("--output", "-o", help="Output PDF file")
    parser.add_argument(
        "--method",
        choices=["auto", "weasyprint", "pandoc", "pandoc_system"],
        default="auto",
        help="Conversion method",
    )
    parser.add_argument("--css", help="Custom CSS file")
    parser.add_argument("--info", action="store_true", help="Show backend information")

    args = parser.parse_args(argv)

    # Check if input file is provided. Do this before building the converter:
    # its constructor probes the backends, which is wasted work (and log
    # noise) when the invocation is going to be rejected anyway.
    if not args.info and not args.input:
        parser.error("Input file is required when not using --info")

    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Create converter
    config = MarkdownConfig()
    if args.css:
        config.css_file = args.css

    converter = EnhancedMarkdownConverter(config)

    # Show backend info if requested
    if args.info:
        info = converter.get_backend_info()
        print("Backend Information:")
        for backend, available in info["available_backends"].items():
            status = "✅" if available else "❌"
            print(f"  {status} {backend}")
        print(f"Recommended backend: {info['recommended_backend']}")
        return 0

    # Convert file
    try:
        success = converter.convert_file_to_pdf(
            input_path=args.input, output_path=args.output, method=args.method
        )

        if success:
            print(f"✅ Successfully converted {args.input} to PDF")
            return 0
        else:
            print("❌ Conversion failed")
            return 1

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Use SystemExit directly: the bare exit() helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen applications).
    raise SystemExit(main())
|