"""
DOCX renderer for report generation using python-docx.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List, Optional
import io
import base64
import re
from datetime import datetime, UTC

try:
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.enum.table import WD_TABLE_ALIGNMENT
    from docx.oxml.shared import OxmlElement, qn
    from docx.oxml.ns import nsdecls
    from docx.oxml import parse_xml
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

# Markdown ATX heading: one to six '#' characters, whitespace, then the text.
_MARKDOWN_HEADING_RE = re.compile(r'^(#{1,6})\s+(\S.*)$')
# Numbered list item such as "3. text" or "12. text"; group 1 is the item text.
_NUMBERED_ITEM_RE = re.compile(r'^\d+\.\s+(.*)$')
# Any "<digits>." prefix; such lines are never promoted to Title Case headings.
_NUMBERED_PREFIX_RE = re.compile(r'^\d+\.')
# A single cell of a markdown table separator row, e.g. "---", ":---", "---:".
_TABLE_SEPARATOR_CELL_RE = re.compile(r'^:?-{3,}:?$')


class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only DOCX-specific guidelines; global prompt is built centrally.

        Neither ``user_prompt`` nor ``title`` is used by this implementation;
        the returned text is a fixed set of format guidelines.
        """
        return (
            "DOCX FORMAT GUIDELINES:\n"
            "- Provide plain text content suitable for Word generation (no markdown/HTML).\n"
            "- Use clear section hierarchy; bullet and numbered lists where needed.\n"
            "- Include tables as simple pipe-delimited lines if tabular data is needed.\n"
            "OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to DOCX format.

        Returns:
            A ``(payload, mime_type)`` tuple:

            * base64-encoded DOCX bytes with the OOXML word-processing MIME
              type when python-docx is importable;
            * the output of ``HtmlRenderer.render`` with ``"text/html"`` when
              python-docx is not installed;
            * a plain-text error message with ``"text/plain"`` if anything
              raises. Errors are logged, never propagated.
        """
        try:
            if not DOCX_AVAILABLE:
                # Fallback to HTML if python-docx not available
                from .html_renderer import HtmlRenderer
                html_renderer = HtmlRenderer()
                html_content, _ = await html_renderer.render(extracted_content, title)
                return html_content, "text/html"

            # Generate DOCX using python-docx
            docx_content = self._generate_docx(extracted_content, title)
            return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        except Exception as e:
            # NOTE(review): self.logger is presumably supplied by BaseRenderer.
            self.logger.error(f"Error rendering DOCX: {str(e)}")
            # Return minimal fallback
            return f"DOCX Generation Error: {str(e)}", "text/plain"

    def _detect_heading(self, line: str) -> Optional[Tuple[int, str]]:
        """Classify a stripped, non-empty line as a heading.

        Returns ``(level, text)`` when the line is a heading, else ``None``.

        Markdown ``#`` markers are checked first so that lines such as
        ``## OVERVIEW`` are not caught by the ALL-CAPS / Title-Case heuristics
        with the ``#`` characters still attached. Lines containing ``|`` are
        never headings, so pipe-delimited table rows reach the table logic.
        """
        markdown = _MARKDOWN_HEADING_RE.match(line)
        if markdown:
            return len(markdown.group(1)), markdown.group(2).strip()

        # Table rows and bullet items must not be promoted to headings.
        if '|' in line or line.startswith(('-', '*')):
            return None

        # ALL CAPS lines are major headings.
        if line.isupper() and len(line) > 3:
            return 1, line

        # Title Case lines are subheadings, unless they look like "N. item".
        if line.istitle() and len(line) > 5 and not _NUMBERED_PREFIX_RE.match(line):
            return 2, line

        return None

    def _generate_docx(self, content: str, title: str) -> str:
        """Generate DOCX content using python-docx.

        Builds a document with a centered title, a centered generation
        timestamp and a page break, then walks ``content`` line by line.
        Heading lines start a new section; every other non-blank line is
        buffered and handed to ``_process_section``.

        Returns:
            The saved document, base64-encoded as a UTF-8 string.

        Raises:
            Exception: any error is logged and re-raised for ``render``.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            title_para = doc.add_heading(title, 0)
            title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # NOTE(review): _format_timestamp is presumably inherited from BaseRenderer.
            date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
            date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            doc.add_page_break()

            current_section: List[str] = []
            # Treat missing content as empty so a title-only document is produced.
            for raw_line in (content or '').split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                heading = self._detect_heading(line)
                if heading is None:
                    current_section.append(line)
                    continue

                # A heading closes the section collected so far.
                if current_section:
                    self._process_section(doc, current_section)
                    current_section = []
                level, text = heading
                doc.add_heading(text, level=level)

            # Process remaining content
            if current_section:
                self._process_section(doc, current_section)

            # Serialize in memory and base64-encode.
            buffer = io.BytesIO()
            doc.save(buffer)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating DOCX: {str(e)}")
            raise

    def _setup_document_styles(self, doc):
        """Set up document styles.

        Sets the 'Normal' style to Calibri 11pt and 'Heading 1'..'Heading 3'
        to bold Calibri at 14pt, 12pt and 10pt. Deeper heading levels keep the
        template defaults. Failures are logged as warnings and ignored.
        """
        try:
            # Set default font
            normal_font = doc.styles['Normal'].font
            normal_font.name = 'Calibri'
            normal_font.size = Pt(11)

            # Set heading styles
            for level in range(1, 4):
                heading_font = doc.styles[f'Heading {level}'].font
                heading_font.name = 'Calibri'
                heading_font.size = Pt(16 - level * 2)
                heading_font.bold = True

        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {str(e)}")

    def _add_text_line(self, doc, line: str):
        """Add one non-table line as a bullet item, numbered item or paragraph."""
        if line.startswith(('- ', '* ')):
            doc.add_paragraph(line[2:], style='List Bullet')
            return

        numbered = _NUMBERED_ITEM_RE.match(line)
        if numbered:
            # Strip the whole "<digits>. " prefix, whatever its width.
            doc.add_paragraph(numbered.group(1), style='List Number')
        else:
            doc.add_paragraph(line)

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements.

        Each run of consecutive lines containing ``|`` is treated as one table
        candidate. If the run yields at least two rows it becomes a table;
        otherwise its lines are emitted as ordinary text. Processing then
        continues with the lines that follow, so content after a table is kept.
        """
        index = 0
        total = len(lines)
        while index < total:
            line = lines[index].strip()
            if not line:
                index += 1
                continue

            if '|' in line:
                # Collect the contiguous run of pipe-delimited lines.
                end = index
                while end < total and '|' in lines[end]:
                    end += 1
                block = lines[index:end]

                table_data = self._extract_table_data(block)
                if table_data:
                    self._add_table(doc, table_data)
                else:
                    # Not a real table (e.g. a lone line containing a pipe).
                    for text in block:
                        stripped = text.strip()
                        if stripped:
                            self._add_text_line(doc, stripped)
                index = end
                continue

            self._add_text_line(doc, line)
            index += 1

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines.

        Reads the first contiguous run of lines containing ``|``. One optional
        leading and one optional trailing pipe are removed from each row, and
        the remainder is split on ``|``. Empty interior cells are kept so that
        columns stay aligned. Markdown separator rows (``---|---``) and rows
        with no content are skipped.

        Returns:
            A list of rows (each a list of cell strings), or ``[]`` when fewer
            than two rows were found.
        """
        table_data = []
        in_table = False

        for line in lines:
            if '|' not in line:
                if in_table:
                    # First non-table line after the table ends it.
                    break
                continue

            in_table = True
            row = line.strip()
            if row.startswith('|'):
                row = row[1:]
            if row.endswith('|'):
                row = row[:-1]

            cells = [cell.strip() for cell in row.split('|')]
            if not any(cells):
                continue
            if all(_TABLE_SEPARATOR_CELL_RE.match(cell) for cell in cells):
                continue
            table_data.append(cells)

        return table_data if len(table_data) > 1 else []

    def _add_table(self, doc, table_data: list):
        """Add a table to the document.

        The table is as wide as the longest row, so ragged rows are not
        truncated; shorter rows leave their trailing cells empty. Failures are
        logged as warnings and ignored.
        """
        try:
            if not table_data:
                return

            column_count = max(len(row) for row in table_data)
            if column_count == 0:
                return

            # Create table
            table = doc.add_table(rows=len(table_data), cols=column_count)
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            # Add data to table; fetch each row's cells once.
            for row, row_data in zip(table.rows, table_data):
                for cell, cell_data in zip(row.cells, row_data):
                    cell.text = str(cell_data)

            # Style the table
            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table.

        Makes every run in the first (header) row bold. Failures are logged as
        warnings and ignored.
        """
        try:
            # Style header row
            if len(table.rows) > 0:
                for cell in table.rows[0].cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True

        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")