gateway/modules/services/serviceGeneration/renderers/docx_renderer.py
2025-10-03 22:40:41 +02:00

293 lines
No EOL
11 KiB
Python

"""
DOCX renderer for report generation using python-docx.
"""
from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC
try:
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
DOCX_AVAILABLE = True
except ImportError:
DOCX_AVAILABLE = False
class DocxRenderer(BaseRenderer):
"""Renders content to DOCX format using python-docx."""
@classmethod
def get_supported_formats(cls) -> List[str]:
    """List the output format identifiers handled by this renderer."""
    formats = ['docx', 'doc']
    return formats
@classmethod
def get_format_aliases(cls) -> List[str]:
    """List the alternative names under which this renderer can be requested."""
    aliases = ['word', 'document']
    return aliases
@classmethod
def get_priority(cls) -> int:
    """Report the numeric priority value of the DOCX renderer."""
    priority = 115
    return priority
def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
    """Build the extraction prompt used for DOCX reports.

    The caller's ``user_prompt`` is placed first, followed by fixed
    instructions (format requirements, structure, formatting rules and
    output policy) that embed ``title`` as the report title.

    Args:
        user_prompt: Free-form request text, inserted verbatim.
        title: Report title, inserted verbatim between double quotes.

    Returns:
        The assembled prompt text.
    """
    prompt = f"""
{user_prompt}
Generate a comprehensive DOCX report with the title: "{title}"
DOCX FORMAT REQUIREMENTS:
- Create structured content suitable for Word documents
- Use clear headings and sections with proper hierarchy
- Include tables for structured data
- Use bullet points and numbered lists where appropriate
- Include source document information
- Structure content for professional presentation
- Use consistent formatting throughout
DOCX STRUCTURE:
- Title page with report title and generation date
- Table of contents (if multiple sections)
- Executive summary
- Main content sections with clear headings
- Data tables and analysis
- Conclusions and recommendations
- Appendices with source information
FORMATTING RULES:
- Use clear section headings (H1, H2, H3 style)
- Include consistent paragraph formatting
- Use tables with proper alignment and borders
- Use bullet points and numbered lists
- Add source citations and references
- Include generation metadata
- Use professional fonts and spacing
OUTPUT POLICY:
- Return ONLY plain text content suitable for Word document generation
- NO markdown formatting (no **bold**, no # headings, no --- separators)
- NO HTML tags
- NO code blocks
- Use plain text with clear structure
- Use line breaks for separation
- Use indentation for lists
- Use ALL CAPS for major headings
- Use Title Case for subheadings
- Use bullet points with dashes (-) for lists
- Use numbers (1., 2., 3.) for numbered lists
- Professional document format
- Include all necessary information
CRITICAL: Use the actual data from the source documents to create the content. Do not generate placeholder text or templates. Extract and use the real data provided in the source documents to create meaningful content.
Generate the complete DOCX report content using the actual data from the source documents:
"""
    return prompt
async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
    """Turn extracted text into a DOCX payload plus its MIME type.

    When python-docx was imported successfully the document is produced by
    ``_generate_docx`` (which returns base64 text). Otherwise the work is
    delegated to ``HtmlRenderer`` and HTML is returned instead. Any
    exception is logged and converted into a plain-text error message, so
    this coroutine does not raise.

    Args:
        extracted_content: Text content to place in the document.
        title: Document title.

    Returns:
        A ``(content, mime_type)`` pair.
    """
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    try:
        if DOCX_AVAILABLE:
            # Normal path: build the document with python-docx
            payload = self._generate_docx(extracted_content, title)
            return payload, docx_mime

        # python-docx is not installed: hand over to the HTML renderer
        from .html_renderer import HtmlRenderer
        fallback_renderer = HtmlRenderer()
        html_text, _unused_mime = await fallback_renderer.render(extracted_content, title)
        return html_text, "text/html"
    except Exception as exc:
        self.logger.error(f"Error rendering DOCX: {str(exc)}")
        # Minimal fallback so the caller still receives a (content, mime) pair
        return f"DOCX Generation Error: {str(exc)}", "text/plain"
def _generate_docx(self, content: str, title: str) -> str:
    """Build a Word document from text content and return it base64-encoded.

    The document starts with a centred title, a centred "Generated: ..."
    line and a page break. The content is then read line by line (blank
    lines are skipped): heading lines become Word headings, and the lines
    between two headings are handed to ``_process_section`` as one section.

    Args:
        content: Text to convert; lines are separated by ``\\n``.
        title: Document title placed on the first page.

    Returns:
        The saved .docx file, base64-encoded as a UTF-8 string.

    Raises:
        Exception: Any error raised while building or saving the document
            is logged and re-raised.
    """
    try:
        doc = Document()
        self._setup_document_styles(doc)

        # Title page: title, generation date, then a page break
        title_para = doc.add_heading(title, 0)
        title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
        date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        doc.add_page_break()

        pending = []  # body lines collected since the last heading
        for raw_line in content.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            heading = self._detect_heading(line)
            if heading is None:
                pending.append(line)
                continue

            # A heading closes the section collected so far
            if pending:
                self._process_section(doc, pending)
                pending = []
            level, text = heading
            doc.add_heading(text, level=level)

        # Flush whatever follows the last heading
        if pending:
            self._process_section(doc, pending)

        buffer = io.BytesIO()
        doc.save(buffer)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')
    except Exception as e:
        self.logger.error(f"Error generating DOCX: {str(e)}")
        raise

def _detect_heading(self, line: str):
    """Classify one stripped, non-empty line as a heading.

    Returns:
        ``(level, text)`` when the line is a heading, otherwise ``None``.
    """
    # Markdown markers are tested first: "# INTRO" is also upper-case and
    # "## Executive Summary" is also title-case, so testing the case rules
    # first would leave the '#' characters inside the heading text.
    for level, marker in ((1, '# '), (2, '## '), (3, '### ')):
        if line.startswith(marker):
            return level, line[len(marker):].strip()

    # Table rows ("Name | Age") and bullet items often satisfy
    # isupper()/istitle(); they are body content, never headings.
    if '|' in line or line.startswith(('-', '*')):
        return None

    # ALL CAPS -> major heading
    if line.isupper() and len(line) > 3:
        return 1, line

    # Title Case -> subheading, unless the line is a numbered item
    # ("6. Next Step"); any leading integer counts, not only 1-5.
    is_numbered = line.partition('.')[0].isdigit()
    if line.istitle() and len(line) > 5 and not is_numbered:
        return 2, line

    return None
def _setup_document_styles(self, doc):
    """Apply the report's fonts to the document's built-in styles.

    The 'Normal' style is set to Calibri 11 pt. 'Heading 1' to 'Heading 3'
    are set to bold Calibri at 14, 12 and 10 pt respectively. Any failure
    is logged as a warning and otherwise ignored.

    Args:
        doc: The python-docx document whose styles are modified in place.
    """
    typeface = 'Calibri'
    try:
        # Body text
        body_font = doc.styles['Normal'].font
        body_font.name = typeface
        body_font.size = Pt(11)

        # Headings shrink by 2 pt per level
        for level in (1, 2, 3):
            level_font = doc.styles[f'Heading {level}'].font
            level_font.name = typeface
            level_font.size = Pt(16 - level * 2)
            level_font.bold = True
    except Exception as exc:
        self.logger.warning(f"Could not set up document styles: {str(exc)}")
def _process_section(self, doc, lines: list):
"""Process a section of content into DOCX elements."""
for line in lines:
if not line.strip():
continue
# Check for tables (lines with |)
if '|' in line and not line.startswith('|'):
# This might be part of a table, process as table
table_data = self._extract_table_data(lines)
if table_data:
self._add_table(doc, table_data)
return
# Check for lists
if line.startswith('- ') or line.startswith('* '):
# This is a list item
doc.add_paragraph(line[2:], style='List Bullet')
elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
# This is a numbered list item
doc.add_paragraph(line[3:], style='List Number')
else:
# Regular paragraph
doc.add_paragraph(line)
def _extract_table_data(self, lines: list) -> list:
"""Extract table data from lines."""
table_data = []
in_table = False
for line in lines:
if '|' in line:
if not in_table:
in_table = True
# Split by | and clean up
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
if cells:
table_data.append(cells)
elif in_table and not line.strip():
# Empty line, might be end of table
break
return table_data if len(table_data) > 1 else []
def _add_table(self, doc, table_data: list):
    """Append a centred table holding ``table_data`` to the document.

    The table is as wide as the longest row, so no cell is dropped when a
    later row has more cells than the first one; shorter rows leave their
    trailing cells empty. The first row is then styled by ``_style_table``.
    Any failure is logged as a warning and otherwise ignored.

    Args:
        doc: Target python-docx document.
        table_data: Rows of cell strings; nothing is added when empty.
    """
    try:
        if not table_data:
            return

        # Size by the widest row rather than by the first row
        column_count = max(len(row_data) for row_data in table_data)
        table = doc.add_table(rows=len(table_data), cols=column_count)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER

        for table_row, row_data in zip(table.rows, table_data):
            # Fetch the row's cells once instead of once per cell
            row_cells = table_row.cells
            for cell, cell_data in zip(row_cells, row_data):
                cell.text = cell_data

        self._style_table(table)
    except Exception as e:
        self.logger.warning(f"Could not add table: {str(e)}")
def _style_table(self, table):
"""Apply styling to the table."""
try:
# Style header row
if len(table.rows) > 0:
header_cells = table.rows[0].cells
for cell in header_cells:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")