# gateway/modules/services/serviceGeneration/renderers/docx_renderer.py

"""
DOCX renderer for report generation using python-docx.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC

try:
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.enum.table import WD_TABLE_ALIGNMENT
    from docx.oxml.shared import OxmlElement, qn
    from docx.oxml.ns import nsdecls
    from docx.oxml import parse_xml
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Get DOCX-specific extraction prompt."""
        return f"""
{user_prompt}

Generate a comprehensive DOCX report with the title: "{title}"

DOCX FORMAT REQUIREMENTS:
- Create structured content suitable for Word documents
- Use clear headings and sections with proper hierarchy
- Include tables for structured data
- Use bullet points and numbered lists where appropriate
- Include source document information
- Structure content for professional presentation
- Use consistent formatting throughout

DOCX STRUCTURE:
- Title page with report title and generation date
- Table of contents (if multiple sections)
- Executive summary
- Main content sections with clear headings
- Data tables and analysis
- Conclusions and recommendations
- Appendices with source information

FORMATTING RULES:
- Use clear section headings (H1, H2, H3 style)
- Include consistent paragraph formatting
- Use tables with proper alignment and borders
- Use bullet points and numbered lists
- Add source citations and references
- Include generation metadata
- Use professional fonts and spacing

OUTPUT POLICY:
- Return ONLY plain text content suitable for Word document generation
- NO markdown formatting (no **bold**, no # headings, no --- separators)
- NO HTML tags
- NO code blocks
- Use plain text with clear structure
- Use line breaks for separation
- Use indentation for lists
- Use ALL CAPS for major headings
- Use Title Case for subheadings
- Use bullet points with dashes (-) for lists
- Use numbers (1., 2., 3.) for numbered lists
- Professional document format
- Include all necessary information

CRITICAL: Use the actual data from the source documents to create the content. Do not generate placeholder text or templates. Extract and use the real data provided in the source documents to create meaningful content.

Generate the complete DOCX report content using the actual data from the source documents:
"""

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content as a DOCX document.

        Args:
            extracted_content: Text to turn into the document body.
            title: Title passed on to the document generator.

        Returns:
            A ``(content, mime_type)`` pair. Normally the content is the
            base64 DOCX produced by ``_generate_docx``. If python-docx could
            not be imported, the HTML renderer's output is returned with
            ``text/html``. If anything raises, a plain-text error message is
            returned with ``text/plain`` instead of propagating the error.
        """
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        try:
            if DOCX_AVAILABLE:
                # Normal path: build the document with python-docx.
                return self._generate_docx(extracted_content, title), docx_mime

            # python-docx is missing: hand the job to the HTML renderer.
            from .html_renderer import HtmlRenderer
            html_content, _ = await HtmlRenderer().render(extracted_content, title)
            return html_content, "text/html"

        except Exception as e:
            self.logger.error(f"Error rendering DOCX: {str(e)}")
            # Best-effort fallback so the caller always receives a result.
            return f"DOCX Generation Error: {str(e)}", "text/plain"

    def _generate_docx(self, content: str, title: str) -> str:
        """Generate DOCX content using python-docx."""
        try:
            # Create new document
            doc = Document()

            # Set up document styles
            self._setup_document_styles(doc)

            # Add title
            title_para = doc.add_heading(title, 0)
            title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # Add generation date
            date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
            date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # Add page break
            doc.add_page_break()

            # Process content
            lines = content.split('\n')
            current_section = []

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Check for ALL CAPS headings (major headings)
                if line.isupper() and len(line) > 3 and not line.startswith('-') and not line.startswith('*'):
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line, level=1)
                # Check for Title Case headings (subheadings)
                elif line.istitle() and len(line) > 5 and not line.startswith('-') and not line.startswith('*') and not line.startswith(('1.', '2.', '3.', '4.', '5.')):
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line, level=2)
                # Check for markdown headings (fallback)
                elif line.startswith('# '):
                    # H1 heading
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line[2:], level=1)
                elif line.startswith('## '):
                    # H2 heading
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line[3:], level=2)
                elif line.startswith('### '):
                    # H3 heading
                    if current_section:
                        self._process_section(doc, current_section)
                        current_section = []
                    doc.add_heading(line[4:], level=3)
                else:
                    current_section.append(line)

            # Process remaining content
            if current_section:
                self._process_section(doc, current_section)

            # Save to buffer
            buffer = io.BytesIO()
            doc.save(buffer)
            buffer.seek(0)

            # Convert to base64
            docx_bytes = buffer.getvalue()
            docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')

            return docx_base64

        except Exception as e:
            self.logger.error(f"Error generating DOCX: {str(e)}")
            raise

    def _setup_document_styles(self, doc):
        """Apply the report fonts to the document's built-in styles.

        'Normal' becomes Calibri 11 pt; 'Heading 1' to 'Heading 3' become
        bold Calibri at 14, 12 and 10 pt. Any failure is logged as a warning
        and otherwise ignored, so the document keeps whatever was applied
        before the error.

        Args:
            doc: The python-docx document whose styles are modified in place.
        """
        try:
            body_font = doc.styles['Normal'].font
            body_font.name = 'Calibri'
            body_font.size = Pt(11)

            # Heading sizes shrink by 2 pt per level, starting at 14 pt.
            for level, points in ((1, 14), (2, 12), (3, 10)):
                font = doc.styles[f'Heading {level}'].font
                font.name = 'Calibri'
                font.size = Pt(points)
                font.bold = True
        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {str(e)}")

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements."""
        for line in lines:
            if not line.strip():
                continue

            # Check for tables (lines with |)
            if '|' in line and not line.startswith('|'):
                # This might be part of a table, process as table
                table_data = self._extract_table_data(lines)
                if table_data:
                    self._add_table(doc, table_data)
                    return

            # Check for lists
            if line.startswith('- ') or line.startswith('* '):
                # This is a list item
                doc.add_paragraph(line[2:], style='List Bullet')
            elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
                # This is a numbered list item
                doc.add_paragraph(line[3:], style='List Number')
            else:
                # Regular paragraph
                doc.add_paragraph(line)

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines."""
        table_data = []
        in_table = False

        for line in lines:
            if '|' in line:
                if not in_table:
                    in_table = True
                # Split by | and clean up
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                if cells:
                    table_data.append(cells)
            elif in_table and not line.strip():
                # Empty line, might be end of table
                break

        return table_data if len(table_data) > 1 else []

    def _add_table(self, doc, table_data: list):
        """Append a centered table filled with *table_data* to the document.

        The grid is sized to the widest row, so rows longer than the first
        one are not truncated; shorter rows leave their trailing cells empty.
        After filling, ``_style_table`` is applied. Any failure is logged as
        a warning and otherwise ignored.

        Args:
            doc: The python-docx document to append to.
            table_data: Rows of cell values; the first row is the header.
        """
        try:
            if not table_data:
                return

            column_count = max(len(row) for row in table_data)
            if column_count == 0:
                return

            table = doc.add_table(rows=len(table_data), cols=column_count)
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            for table_row, row_data in zip(table.rows, table_data):
                # Look the row's cells up once instead of once per cell.
                row_cells = table_row.cells
                for cell, cell_data in zip(row_cells, row_data):
                    cell.text = str(cell_data)

            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table."""
        try:
            # Style header row
            if len(table.rows) > 0:
                header_cells = table.rows[0].cells
                for cell in header_cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")