Files
office_document_test/raganything/enhanced_markdown.py
MinalMahalaShorthillsAI 60f05e04cf improvised version
2025-07-28 10:08:54 +05:30

535 lines
15 KiB
Python

"""
Enhanced Markdown to PDF Conversion
This module provides improved Markdown to PDF conversion with:
- Better formatting and styling
- Image support
- Table support
- Code syntax highlighting
- Custom templates
- Multiple output formats
"""
import os
import logging
from pathlib import Path
from typing import Dict, Any, Optional
from dataclasses import dataclass
import tempfile
import subprocess
# Optional backends. Each flag records whether the corresponding package can
# be used, so the module stays importable when a backend is not installed.
try:
    import markdown

    MARKDOWN_AVAILABLE = True
except ImportError:
    MARKDOWN_AVAILABLE = False

try:
    from weasyprint import HTML

    WEASYPRINT_AVAILABLE = True
except (ImportError, OSError):
    # WeasyPrint raises OSError (not ImportError) when the package is
    # installed but its native libraries (Pango, GObject, ...) cannot be
    # loaded; treat that as "backend unavailable" rather than crashing the
    # import of this module.
    WEASYPRINT_AVAILABLE = False

try:
    # Check if pandoc module exists (not used directly, just for detection)
    import importlib.util

    spec = importlib.util.find_spec("pandoc")
    PANDOC_AVAILABLE = spec is not None
except (ImportError, ValueError):
    # find_spec raises ValueError when an already-imported module named
    # "pandoc" has no __spec__.
    PANDOC_AVAILABLE = False
@dataclass
class MarkdownConfig:
    """Settings bundle handed to the Markdown converter.

    Every field has a default, so ``MarkdownConfig()`` is a complete
    configuration. Field order is part of the positional constructor
    signature and must not be rearranged.

    NOTE(review): this class only stores values; which of them a given
    backend actually consumes is decided by the converter - verify there.
    """

    # --- Styling -----------------------------------------------------------
    # Path to a CSS file (the CLI stores its --css argument here).
    css_file: Optional[str] = None
    # Path to a template file.
    template_file: Optional[str] = None
    # Page size name.
    page_size: str = "A4"
    # Page margin, as a CSS-style length string.
    margin: str = "1in"
    # Base font size, as a CSS-style length string.
    font_size: str = "12pt"
    # Line height, kept as a string.
    line_height: str = "1.5"

    # --- Content -----------------------------------------------------------
    # Whether a table of contents is wanted.
    include_toc: bool = True
    # Whether code blocks should be syntax highlighted.
    syntax_highlighting: bool = True
    # Maximum image width, as a CSS-style length string.
    image_max_width: str = "100%"
    # Inline CSS declarations intended for tables.
    table_style: str = "border-collapse: collapse; width: 100%;"

    # --- Output ------------------------------------------------------------
    # Target format: pdf, html, docx
    output_format: str = "pdf"
    # Directory for generated files.
    output_dir: Optional[str] = None

    # --- Advanced ----------------------------------------------------------
    # Literal CSS text supplied by the caller.
    custom_css: Optional[str] = None
    # Free-form string-to-string metadata.
    metadata: Optional[Dict[str, str]] = None
class EnhancedMarkdownConverter:
    """
    Enhanced Markdown to PDF converter with multiple backends

    Supported conversion methods:
    - WeasyPrint: Markdown is rendered to a styled HTML document with the
      ``markdown`` package, then written to PDF by WeasyPrint
    - Pandoc: the ``pandoc`` executable is invoked on a temporary Markdown
      file, using wkhtmltopdf as the PDF engine

    Configuration fields that influence the output: ``custom_css``,
    ``css_file``, ``include_toc`` and ``syntax_highlighting``.
    """

    # Encodings tried, in order, when reading a Markdown file.
    # - utf-8-sig reads plain UTF-8 and also strips a leading BOM, which would
    #   otherwise stop the first heading from being recognised.
    # - cp1252 must come before latin-1: latin-1 accepts every byte sequence,
    #   so anything listed after it can never be reached.
    _FALLBACK_ENCODINGS = ("utf-8-sig", "gbk", "cp1252", "latin-1")

    def __init__(self, config: "Optional[MarkdownConfig]" = None):
        """
        Initialize the converter

        Args:
            config: Configuration for conversion; a default MarkdownConfig
                is created when omitted
        """
        self.config = config or MarkdownConfig()
        self.logger = logging.getLogger(__name__)
        # Check available backends
        self.available_backends = self._check_backends()
        usable = [name for name, ok in self.available_backends.items() if ok]
        self.logger.info(f"Available backends: {usable}")

    def _check_backends(self) -> Dict[str, bool]:
        """
        Check which conversion backends are available

        Returns:
            Mapping of backend name ("weasyprint", "pandoc", "markdown",
            "pandoc_system") to availability
        """
        backends = {
            "weasyprint": WEASYPRINT_AVAILABLE,
            "pandoc": PANDOC_AVAILABLE,
            "markdown": MARKDOWN_AVAILABLE,
        }
        # Check if pandoc is installed on system
        try:
            subprocess.run(["pandoc", "--version"], capture_output=True, check=True)
            backends["pandoc_system"] = True
        except (subprocess.CalledProcessError, OSError):
            # OSError covers FileNotFoundError (not installed) as well as
            # PermissionError and similar launch failures.
            backends["pandoc_system"] = False
        return backends

    def _get_default_css(self) -> str:
        """Get default CSS styling"""
        return """
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        h1, h2, h3, h4, h5, h6 {
            color: #2c3e50;
            margin-top: 1.5em;
            margin-bottom: 0.5em;
        }
        h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }
        h2 { font-size: 1.5em; border-bottom: 1px solid #bdc3c7; padding-bottom: 0.2em; }
        h3 { font-size: 1.3em; }
        h4 { font-size: 1.1em; }
        p { margin-bottom: 1em; }
        code {
            background-color: #f8f9fa;
            padding: 2px 4px;
            border-radius: 3px;
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
        }
        pre {
            background-color: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            border-left: 4px solid #3498db;
        }
        pre code {
            background-color: transparent;
            padding: 0;
        }
        blockquote {
            border-left: 4px solid #3498db;
            margin: 0;
            padding-left: 20px;
            color: #7f8c8d;
        }
        table {
            border-collapse: collapse;
            width: 100%;
            margin: 1em 0;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px 12px;
            text-align: left;
        }
        th {
            background-color: #f2f2f2;
            font-weight: bold;
        }
        img {
            max-width: 100%;
            height: auto;
            display: block;
            margin: 1em auto;
        }
        ul, ol {
            margin-bottom: 1em;
        }
        li {
            margin-bottom: 0.5em;
        }
        a {
            color: #3498db;
            text-decoration: none;
        }
        a:hover {
            text-decoration: underline;
        }
        .toc {
            background-color: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 2em;
        }
        .toc ul {
            list-style-type: none;
            padding-left: 0;
        }
        .toc li {
            margin-bottom: 0.3em;
        }
        .toc a {
            color: #2c3e50;
        }
        """

    def _resolve_css(self) -> str:
        """
        Pick the stylesheet text to embed in the HTML document

        Precedence: ``config.custom_css``, then the contents of
        ``config.css_file``, then the built-in default stylesheet. An
        unreadable ``css_file`` is logged and the default is used instead,
        so a bad path degrades styling rather than aborting the conversion.

        Returns:
            CSS text
        """
        if self.config.custom_css:
            return self.config.custom_css
        css_file = self.config.css_file
        if css_file:
            try:
                return Path(css_file).read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                self.logger.warning(
                    f"Could not read CSS file {css_file}: {str(e)}; "
                    "using default styles"
                )
        return self._get_default_css()

    def _process_markdown_content(self, content: str) -> str:
        """
        Process Markdown content with extensions

        Args:
            content: Markdown source text

        Returns:
            A complete HTML document with the stylesheet embedded

        Raises:
            RuntimeError: If the ``markdown`` package is not installed
        """
        if not MARKDOWN_AVAILABLE:
            raise RuntimeError(
                "Markdown library not available. Install with: pip install markdown"
            )
        # Configure Markdown extensions. extension_configs is looked up by the
        # exact string used in `extensions`, so each config is registered
        # under the same dotted name as the extension it belongs to.
        extensions = [
            "markdown.extensions.tables",
            "markdown.extensions.fenced_code",
        ]
        extension_configs: Dict[str, Dict[str, Any]] = {}
        if self.config.syntax_highlighting:
            extensions.append("markdown.extensions.codehilite")
            extension_configs["markdown.extensions.codehilite"] = {
                "css_class": "highlight",
                "use_pygments": True,
            }
        if self.config.include_toc:
            extensions.append("markdown.extensions.toc")
            extension_configs["markdown.extensions.toc"] = {
                "title": "Table of Contents",
                "permalink": True,
            }
        extensions.extend(
            [
                "markdown.extensions.attr_list",
                "markdown.extensions.def_list",
                "markdown.extensions.footnotes",
            ]
        )
        # Convert Markdown to HTML
        md = markdown.Markdown(
            extensions=extensions, extension_configs=extension_configs
        )
        html_content = md.convert(content)
        # Add CSS styling
        css = self._resolve_css()
        # Create complete HTML document
        html_doc = f"""
        <!DOCTYPE html>
        <html>
        <head>
        <meta charset="UTF-8">
        <title>Converted Document</title>
        <style>
        {css}
        </style>
        </head>
        <body>
        {html_content}
        </body>
        </html>
        """
        return html_doc

    def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
        """
        Convert using WeasyPrint (best for styling)

        Args:
            markdown_content: Markdown content to convert
            output_path: Output PDF file path

        Returns:
            True if the PDF was written, False if rendering failed (the
            error is logged)

        Raises:
            RuntimeError: If WeasyPrint is not available
        """
        if not WEASYPRINT_AVAILABLE:
            raise RuntimeError(
                "WeasyPrint not available. Install with: pip install weasyprint"
            )
        try:
            # Process Markdown to HTML
            html_content = self._process_markdown_content(markdown_content)
            # Convert HTML to PDF
            HTML(string=html_content).write_pdf(output_path)
            self.logger.info(
                f"Successfully converted to PDF using WeasyPrint: {output_path}"
            )
            return True
        except Exception as e:
            self.logger.error(f"WeasyPrint conversion failed: {str(e)}")
            return False

    def convert_with_pandoc(
        self, markdown_content: str, output_path: str, use_system_pandoc: bool = False
    ) -> bool:
        """
        Convert using Pandoc (best for complex documents)

        Args:
            markdown_content: Markdown content to convert
            output_path: Output PDF file path
            use_system_pandoc: Attempt the conversion even when the startup
                probe did not find a ``pandoc`` executable

        Returns:
            True if pandoc exited successfully, False otherwise (the error
            is logged)

        Raises:
            RuntimeError: If pandoc was not detected and
                ``use_system_pandoc`` is False
        """
        if (
            not self.available_backends.get("pandoc_system", False)
            and not use_system_pandoc
        ):
            raise RuntimeError(
                "Pandoc not available. Install from: https://pandoc.org/installing.html"
            )
        temp_md_path = None
        try:
            # Create temporary markdown file. Pandoc reads its input as
            # UTF-8, so the file must not be written in the locale encoding.
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".md", delete=False, encoding="utf-8"
            ) as temp_file:
                temp_file.write(markdown_content)
                temp_md_path = temp_file.name
            # Build pandoc command with wkhtmltopdf engine
            cmd = [
                "pandoc",
                temp_md_path,
                "-o",
                output_path,
                "--pdf-engine=wkhtmltopdf",
                "--standalone",
            ]
            if self.config.include_toc:
                cmd.append("--toc")
            cmd.append("--number-sections")
            # Run pandoc
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
            if result.returncode == 0:
                self.logger.info(
                    f"Successfully converted to PDF using Pandoc: {output_path}"
                )
                return True
            self.logger.error(f"Pandoc conversion failed: {result.stderr}")
            return False
        except Exception as e:
            self.logger.error(f"Pandoc conversion failed: {str(e)}")
            return False
        finally:
            # delete=False above means the temp file must be removed here.
            if temp_md_path and os.path.exists(temp_md_path):
                try:
                    os.unlink(temp_md_path)
                except OSError as e:
                    self.logger.error(
                        f"Failed to clean up temp file {temp_md_path}: {str(e)}"
                    )

    def _run_backend(
        self, method: str, markdown_content: str, output_path: str
    ) -> bool:
        """
        Dispatch a single conversion attempt to the named backend

        Raises:
            ValueError: If ``method`` is not a known backend name
            RuntimeError: If the chosen backend is not available
        """
        if method == "weasyprint":
            return self.convert_with_weasyprint(markdown_content, output_path)
        if method == "pandoc":
            return self.convert_with_pandoc(markdown_content, output_path)
        if method == "pandoc_system":
            return self.convert_with_pandoc(
                markdown_content, output_path, use_system_pandoc=True
            )
        raise ValueError(f"Unknown conversion method: {method}")

    def convert_markdown_to_pdf(
        self, markdown_content: str, output_path: str, method: str = "auto"
    ) -> bool:
        """
        Convert markdown content to PDF

        With ``method="auto"`` every detected backend is tried in order of
        preference until one succeeds.

        Args:
            markdown_content: Markdown content to convert
            output_path: Output PDF file path
            method: Conversion method ("auto", "weasyprint", "pandoc", "pandoc_system")

        Returns:
            True if conversion successful, False otherwise
        """
        if method == "auto":
            candidates = self._auto_candidates()
            if not candidates:
                self.logger.error(
                    "No conversion backend available. Install WeasyPrint "
                    "(pip install weasyprint) or Pandoc "
                    "(https://pandoc.org/installing.html)"
                )
                return False
        else:
            candidates = [method]

        for candidate in candidates:
            try:
                if self._run_backend(candidate, markdown_content, output_path):
                    return True
            except Exception as e:
                self.logger.error(
                    f"{str(candidate).title()} conversion failed: {str(e)}"
                )
        return False

    def _read_markdown_file(self, path: Path) -> str:
        """
        Read a Markdown file, trying several encodings in turn

        Args:
            path: File to read

        Returns:
            Decoded file content

        Raises:
            RuntimeError: If no encoding in the fallback list can decode
                the file (defensive: the final latin-1 entry accepts any
                byte sequence)
        """
        for encoding in self._FALLBACK_ENCODINGS:
            try:
                with open(path, "r", encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        raise RuntimeError(
            f"Could not decode file {path} with any supported encoding"
        )

    def convert_file_to_pdf(
        self, input_path: str, output_path: Optional[str] = None, method: str = "auto"
    ) -> bool:
        """
        Convert Markdown file to PDF

        Args:
            input_path: Input Markdown file path
            output_path: Output PDF file path (optional; defaults to the
                input path with a ``.pdf`` suffix)
            method: Conversion method

        Returns:
            bool: True if conversion successful

        Raises:
            FileNotFoundError: If ``input_path`` does not exist
        """
        input_path_obj = Path(input_path)
        if not input_path_obj.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")
        # Read markdown content
        markdown_content = self._read_markdown_file(input_path_obj)
        # Determine output path
        if output_path is None:
            output_path = str(input_path_obj.with_suffix(".pdf"))
        return self.convert_markdown_to_pdf(markdown_content, output_path, method)

    def get_backend_info(self) -> Dict[str, Any]:
        """
        Get information about available backends

        Returns:
            Dict with the availability map, the recommended backend name
            and a summary of selected configuration values
        """
        return {
            "available_backends": self.available_backends,
            "recommended_backend": self._get_recommended_backend(),
            "config": {
                "page_size": self.config.page_size,
                "margin": self.config.margin,
                "font_size": self.config.font_size,
                "include_toc": self.config.include_toc,
                "syntax_highlighting": self.config.syntax_highlighting,
            },
        }

    def _auto_candidates(self) -> list:
        """List detected backends in order of preference (pandoc first)"""
        candidates = []
        if self.available_backends.get("pandoc_system", False):
            candidates.append("pandoc")
        if self.available_backends.get("weasyprint", False):
            candidates.append("weasyprint")
        return candidates

    def _get_recommended_backend(self) -> str:
        """
        Get recommended backend based on availability

        Returns:
            "pandoc", "weasyprint", or "none" when nothing was detected
        """
        candidates = self._auto_candidates()
        return candidates[0] if candidates else "none"
def main(argv: Optional[list] = None) -> int:
    """
    Command-line interface for enhanced markdown conversion

    Args:
        argv: Argument list to parse. When None, argparse reads the
            process command line.

    Returns:
        Exit code: 0 after a successful conversion or after printing
        ``--info``, 1 when the conversion fails or raises

    Raises:
        SystemExit: Raised by argparse for ``--help``, invalid arguments,
            or a missing input file when ``--info`` is not given
    """
    import argparse

    parser = argparse.ArgumentParser(description="Enhanced Markdown to PDF conversion")
    parser.add_argument("input", nargs="?", help="Input markdown file")
    parser.add_argument("--output", "-o", help="Output PDF file")
    parser.add_argument(
        "--method",
        choices=["auto", "weasyprint", "pandoc", "pandoc_system"],
        default="auto",
        help="Conversion method",
    )
    parser.add_argument("--css", help="Custom CSS file")
    parser.add_argument("--info", action="store_true", help="Show backend information")
    args = parser.parse_args(argv)

    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Create converter
    config = MarkdownConfig()
    if args.css:
        config.css_file = args.css
    converter = EnhancedMarkdownConverter(config)

    # Show backend info if requested
    if args.info:
        info = converter.get_backend_info()
        print("Backend Information:")
        for backend, available in info["available_backends"].items():
            # Same markers as the success/failure messages below, so each
            # line actually shows whether the backend is usable.
            status = "✅" if available else "❌"
            print(f" {status} {backend}")
        print(f"Recommended backend: {info['recommended_backend']}")
        return 0

    # Check if input file is provided
    if not args.input:
        parser.error("Input file is required when not using --info")

    # Convert file
    try:
        success = converter.convert_file_to_pdf(
            input_path=args.input, output_path=args.output, method=args.method
        )
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return 1

    if success:
        print(f"✅ Successfully converted {args.input} to PDF")
        return 0
    print("❌ Conversion failed")
    return 1
if __name__ == "__main__":
    # The bare `exit` helper is injected by the `site` module for interactive
    # use and is missing under `python -S` or in some embedded/frozen
    # interpreters; SystemExit is always available and carries main()'s
    # return value as the process exit status.
    raise SystemExit(main())