""" DOCX renderer for report generation using python-docx. """ from .base_renderer import BaseRenderer from typing import Dict, Any, Tuple, List import io import base64 from datetime import datetime, UTC try: from docx import Document from docx.shared import Inches, Pt from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_TABLE_ALIGNMENT from docx.oxml.shared import OxmlElement, qn from docx.oxml.ns import nsdecls from docx.oxml import parse_xml DOCX_AVAILABLE = True except ImportError: DOCX_AVAILABLE = False class DocxRenderer(BaseRenderer): """Renders content to DOCX format using python-docx.""" @classmethod def get_supported_formats(cls) -> List[str]: """Return supported DOCX formats.""" return ['docx', 'doc'] @classmethod def get_format_aliases(cls) -> List[str]: """Return format aliases.""" return ['word', 'document'] @classmethod def get_priority(cls) -> int: """Return priority for DOCX renderer.""" return 115 def getExtractionPrompt(self, user_prompt: str, title: str) -> str: """Get DOCX-specific extraction prompt.""" return f""" {user_prompt} Generate a comprehensive DOCX report with the title: "{title}" DOCX FORMAT REQUIREMENTS: - Create structured content suitable for Word documents - Use clear headings and sections with proper hierarchy - Include tables for structured data - Use bullet points and numbered lists where appropriate - Include source document information - Structure content for professional presentation - Use consistent formatting throughout DOCX STRUCTURE: - Title page with report title and generation date - Table of contents (if multiple sections) - Executive summary - Main content sections with clear headings - Data tables and analysis - Conclusions and recommendations - Appendices with source information FORMATTING RULES: - Use clear section headings (H1, H2, H3 style) - Include consistent paragraph formatting - Use tables with proper alignment and borders - Use bullet points and numbered lists - Add source citations and references - Include generation metadata - Use professional fonts and spacing OUTPUT POLICY: - Return ONLY plain text content suitable for Word document generation - NO markdown formatting (no **bold**, no # headings, no --- separators) - NO HTML tags - NO code blocks - Use plain text with clear structure - Use line breaks for separation - Use indentation for lists - Use ALL CAPS for major headings - Use Title Case for subheadings - Use bullet points with dashes (-) for lists - Use numbers (1., 2., 3.) for numbered lists - Professional document format - Include all necessary information CRITICAL: Use the actual data from the source documents to create the content. Do not generate placeholder text or templates. Extract and use the real data provided in the source documents to create meaningful content. Generate the complete DOCX report content using the actual data from the source documents: """ async def render(self, extracted_content: str, title: str) -> Tuple[str, str]: """Render extracted content to DOCX format.""" try: if not DOCX_AVAILABLE: # Fallback to HTML if python-docx not available from .html_renderer import HtmlRenderer html_renderer = HtmlRenderer() html_content, _ = await html_renderer.render(extracted_content, title) return html_content, "text/html" # Generate DOCX using python-docx docx_content = self._generate_docx(extracted_content, title) return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" except Exception as e: self.logger.error(f"Error rendering DOCX: {str(e)}") # Return minimal fallback return f"DOCX Generation Error: {str(e)}", "text/plain" def _generate_docx(self, content: str, title: str) -> str: """Generate DOCX content using python-docx.""" try: # Create new document doc = Document() # Set up document styles self._setup_document_styles(doc) # Add title title_para = doc.add_heading(title, 0) title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Add generation date date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}") date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Add page break doc.add_page_break() # Process content lines = content.split('\n') current_section = [] for line in lines: line = line.strip() if not line: continue # Check for ALL CAPS headings (major headings) if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'): if current_section: self._process_section(doc, current_section) current_section = [] doc.add_heading(line, level=1) # Check for Title Case headings (subheadings) elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')): if current_section: self._process_section(doc, current_section) current_section = [] doc.add_heading(line, level=2) # Check for markdown headings (fallback) elif line.startswith('# '): # H1 heading if current_section: self._process_section(doc, current_section) current_section = [] doc.add_heading(line[2:], level=1) elif line.startswith('## '): # H2 heading if current_section: self._process_section(doc, current_section) current_section = [] doc.add_heading(line[3:], level=2) elif line.startswith('### '): # H3 heading if current_section: self._process_section(doc, current_section) current_section = [] doc.add_heading(line[4:], level=3) else: current_section.append(line) # Process remaining content if current_section: self._process_section(doc, current_section) # Save to buffer buffer = io.BytesIO() doc.save(buffer) buffer.seek(0) # Convert to base64 docx_bytes = buffer.getvalue() docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') return docx_base64 except Exception as e: self.logger.error(f"Error generating DOCX: {str(e)}") raise def _setup_document_styles(self, doc): """Set up document styles.""" try: # Set default font style = doc.styles['Normal'] font = style.font font.name = 'Calibri' font.size = Pt(11) # Set heading styles for i in range(1, 4): heading_style = doc.styles[f'Heading {i}'] heading_font = heading_style.font heading_font.name = 'Calibri' heading_font.size = Pt(16 - i * 2) heading_font.bold = True except Exception as e: self.logger.warning(f"Could not set up document styles: {str(e)}") def _process_section(self, doc, lines: list): """Process a section of content into DOCX elements.""" for line in lines: if not line.strip(): continue # Check for tables (lines with |) if '|' in line and not line.startswith('|'): # This might be part of a table, process as table table_data = self._extract_table_data(lines) if table_data: self._add_table(doc, table_data) return # Check for lists if line.startswith('- ') or line.startswith('* '): # This is a list item doc.add_paragraph(line[2:], style='List Bullet') elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')): # This is a numbered list item doc.add_paragraph(line[3:], style='List Number') else: # Regular paragraph doc.add_paragraph(line) def _extract_table_data(self, lines: list) -> list: """Extract table data from lines.""" table_data = [] in_table = False for line in lines: if '|' in line: if not in_table: in_table = True # Split by | and clean up cells = [cell.strip() for cell in line.split('|') if cell.strip()] if cells: table_data.append(cells) elif in_table and not line.strip(): # Empty line, might be end of table break return table_data if len(table_data) > 1 else [] def _add_table(self, doc, table_data: list): """Add a table to the document.""" try: if not table_data: return # Create table table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) table.alignment = WD_TABLE_ALIGNMENT.CENTER # Add data to table for row_idx, row_data in enumerate(table_data): for col_idx, cell_data in enumerate(row_data): if col_idx < len(table.rows[row_idx].cells): table.rows[row_idx].cells[col_idx].text = cell_data # Style the table self._style_table(table) except Exception as e: self.logger.warning(f"Could not add table: {str(e)}") def _style_table(self, table): """Apply styling to the table.""" try: # Style header row if len(table.rows) > 0: header_cells = table.rows[0].cells for cell in header_cells: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True except Exception as e: self.logger.warning(f"Could not style table: {str(e)}")