# gateway/modules/services/serviceGeneration/renderers/docx_renderer.py

"""
DOCX renderer for report generation using python-docx.
"""

from .base_renderer import BaseRenderer
from typing import Dict, Any, Tuple, List
import io
import base64
from datetime import datetime, UTC

try:
    from docx import Document
    from docx.shared import Inches, Pt
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.enum.table import WD_TABLE_ALIGNMENT
    from docx.oxml.shared import OxmlElement, qn
    from docx.oxml.ns import nsdecls
    from docx.oxml import parse_xml
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx."""

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Return only DOCX-specific guidelines; global prompt is built centrally."""
        return (
            "DOCX FORMAT GUIDELINES:\n"
            "- Provide plain text content suitable for Word generation (no markdown/HTML).\n"
            "- Use clear section hierarchy; bullet and numbered lists where needed.\n"
            "- Include tables as simple pipe-delimited lines if tabular data is needed.\n"
            "OUTPUT: Return ONLY the structured plain text to be converted into DOCX."
        )

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render the extracted content and return ``(payload, mime_type)``.

        When python-docx is importable the payload is the base64 string
        produced by ``_generate_docx`` with the DOCX MIME type. Otherwise the
        work is delegated to ``HtmlRenderer`` and its content is returned as
        ``text/html``. Any exception is logged and converted into a
        ``text/plain`` error message instead of propagating.
        """
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        try:
            if DOCX_AVAILABLE:
                return self._generate_docx(extracted_content, title), docx_mime

            # python-docx is missing: fall back to the HTML renderer.
            from .html_renderer import HtmlRenderer
            fallback_result = await HtmlRenderer().render(extracted_content, title)
            html_content, _ = fallback_result
            return html_content, "text/html"

        except Exception as exc:
            reason = str(exc)
            self.logger.error(f"Error rendering DOCX: {reason}")
            # Minimal fallback so the caller still receives a payload.
            return f"DOCX Generation Error: {reason}", "text/plain"

    def _generate_docx(self, content: str, title: str) -> str:
        """Build the DOCX document and return it as a base64-encoded string.

        The document starts with a centered title (heading level 0), a
        centered "Generated: ..." line and a page break. The content is then
        read line by line: heading lines (see ``_match_heading``) become Word
        headings, and the non-heading lines between two headings are handed
        to ``_process_section`` as one section.

        Args:
            content: Plain-text (optionally markdown-flavoured) report body.
            title: Text of the document title.

        Returns:
            The saved ``.docx`` bytes, base64-encoded and decoded as UTF-8 text.

        Raises:
            Exception: Any error raised while building or saving the document
                is logged and re-raised unchanged.
        """
        try:
            doc = Document()
            self._setup_document_styles(doc)

            title_para = doc.add_heading(title, 0)
            title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            # _format_timestamp is not defined in this class; presumably
            # inherited from BaseRenderer - verify against the base class.
            date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
            date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            doc.add_page_break()

            pending_section = []
            for raw_line in content.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                heading = self._match_heading(line)
                if heading is None:
                    pending_section.append(line)
                    continue

                # A heading closes the section collected so far.
                if pending_section:
                    self._process_section(doc, pending_section)
                    pending_section = []
                level, text = heading
                doc.add_heading(text, level=level)

            # Flush whatever follows the last heading.
            if pending_section:
                self._process_section(doc, pending_section)

            buffer = io.BytesIO()
            doc.save(buffer)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating DOCX: {str(e)}")
            raise

    @staticmethod
    def _match_heading(line: str):
        """Classify a stripped, non-empty line as a heading.

        Returns a ``(level, text)`` tuple when the line is a heading, or
        ``None`` when it is ordinary section content.
        """
        # Markdown markers are tested first: "# INTRO" is also all-caps and
        # "## Section Title" is also title-case, so testing the case-based
        # rules first would emit the heading with its '#' characters attached.
        for marker, level in (('# ', 1), ('## ', 2), ('### ', 3)):
            if line.startswith(marker):
                return level, line[len(marker):]

        # List items and pipe-delimited table rows are never headings, even
        # when every word happens to be capitalised ("Name | Age | City").
        if line.startswith(('-', '*')) or '|' in line:
            return None

        # ALL CAPS line -> major heading.
        if line.isupper() and len(line) > 3:
            return 1, line

        # Title Case line -> subheading, unless it starts with "<digits>."
        # (a numbered item), whatever the number is.
        prefix, dot, _ = line.partition('.')
        is_numbered = bool(dot) and prefix.isascii() and prefix.isdigit()
        if line.istitle() and len(line) > 5 and not is_numbered:
            return 2, line

        return None

    def _setup_document_styles(self, doc):
        """Apply the default fonts to the body and heading styles of ``doc``.

        'Normal' is set to Calibri 11pt; 'Heading 1' to 'Heading 3' are set to
        bold Calibri at 14pt, 12pt and 10pt. Failures are logged as a warning
        and otherwise ignored, so styling problems never abort rendering.
        """
        try:
            # Body text.
            body_font = doc.styles['Normal'].font
            body_font.name = 'Calibri'
            body_font.size = Pt(11)

            # Headings shrink by two points per level.
            for level in (1, 2, 3):
                level_font = doc.styles[f'Heading {level}'].font
                level_font.name = 'Calibri'
                level_font.size = Pt(16 - 2 * level)
                level_font.bold = True
        except Exception as exc:
            self.logger.warning(f"Could not set up document styles: {str(exc)}")

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements."""
        for line in lines:
            if not line.strip():
                continue

            # Check for tables (lines with |)
            if '|' in line and not line.startswith('|'):
                # This might be part of a table, process as table
                table_data = self._extract_table_data(lines)
                if table_data:
                    self._add_table(doc, table_data)
                    return

            # Check for lists
            if line.startswith('- ') or line.startswith('* '):
                # This is a list item
                doc.add_paragraph(line[2:], style='List Bullet')
            elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
                # This is a numbered list item
                doc.add_paragraph(line[3:], style='List Number')
            else:
                # Regular paragraph
                doc.add_paragraph(line)

    def _extract_table_data(self, lines: list) -> list:
        """Extract table data from lines."""
        table_data = []
        in_table = False

        for line in lines:
            if '|' in line:
                if not in_table:
                    in_table = True
                # Split by | and clean up
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                if cells:
                    table_data.append(cells)
            elif in_table and not line.strip():
                # Empty line, might be end of table
                break

        return table_data if len(table_data) > 1 else []

    def _add_table(self, doc, table_data: list):
        """Append a centered table holding ``table_data`` to the document.

        The table has one row per entry of ``table_data`` and as many columns
        as the widest row, so rows longer than the first one keep all their
        cells; shorter rows leave their trailing cells empty. The table is
        then passed to ``_style_table``. Failures are logged as a warning and
        otherwise ignored.

        Args:
            doc: Document object providing ``add_table``.
            table_data: List of rows, each a list of cell strings.
        """
        try:
            if not table_data:
                return

            # Size by the widest row rather than the first one.
            column_count = max(len(row) for row in table_data)
            if column_count == 0:
                return

            table = doc.add_table(rows=len(table_data), cols=column_count)
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            for table_row, row_data in zip(table.rows, table_data):
                # Fetch the row's cells once instead of once per cell.
                row_cells = table_row.cells
                for cell, cell_data in zip(row_cells, row_data):
                    cell.text = cell_data

            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table."""
        try:
            # Style header row
            if len(table.rows) > 0:
                header_cells = table.rows[0].cells
                for cell in header_cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")