# Copyright (c) 2025 Patrick Motsch # All rights reserved. """ DOCX renderer for report generation using python-docx. """ from .rendererBaseTemplate import BaseRenderer from typing import Dict, Any, Tuple, List import io import base64 import re import csv try: from docx import Document from docx.shared import Inches, Pt, RGBColor from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_TABLE_ALIGNMENT DOCX_AVAILABLE = True except ImportError: DOCX_AVAILABLE = False class RendererDocx(BaseRenderer): """Renders content to DOCX format using python-docx.""" @classmethod def getSupportedFormats(cls) -> List[str]: """Return supported DOCX formats.""" return ['docx', 'doc'] @classmethod def getFormatAliases(cls) -> List[str]: """Return format aliases.""" return ['word', 'document'] @classmethod def getPriority(cls) -> int: """Return priority for DOCX renderer.""" return 115 async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> Tuple[str, str]: """Render extracted JSON content to DOCX format using AI-analyzed styling.""" self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={userPrompt[:50] if userPrompt else 'None'}...", "DOCX_RENDERER") try: if not DOCX_AVAILABLE: # Fallback to HTML if python-docx not available from .rendererHtml import RendererHtml htmlRenderer = RendererHtml() htmlContent, _ = await htmlRenderer.render(extractedContent, title) return htmlContent, "text/html" # Generate DOCX using AI-analyzed styling docx_content = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService) return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" except Exception as e: self.logger.error(f"Error rendering DOCX: {str(e)}") # Return minimal fallback return f"DOCX Generation Error: {str(e)}", "text/plain" async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: """Generate DOCX content from structured JSON document.""" try: # Create new document doc = Document() # Get style set: default styles, enhanced with AI if style instructions present styleSet = await self._getStyleSet(userPrompt, aiService) # Setup basic document styles and create all styles from style set self._setupBasicDocumentStyles(doc) self._setupDocumentStyles(doc, styleSet) # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]}) if not self._validateJsonStructure(json_content): raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}") # Extract sections and metadata from standardized schema sections = self._extractSections(json_content) metadata = self._extractMetadata(json_content) # Use title from JSON metadata if available, otherwise use provided title document_title = metadata.get("title", title) # Add document title using Title style if document_title: doc.add_paragraph(document_title, style='Title') # Process each section in order for section in sections: self._renderJsonSection(doc, section, styleSet) # Save to buffer buffer = io.BytesIO() doc.save(buffer) buffer.seek(0) # Convert to base64 docx_bytes = buffer.getvalue() docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') return docx_base64 except Exception as e: self.logger.error(f"Error generating DOCX from JSON: {str(e)}") raise Exception(f"DOCX generation failed: {str(e)}") async def _getStyleSet(self, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]: """Get style set - default styles, enhanced with AI if userPrompt provided. Args: userPrompt: User's prompt (AI will detect style instructions in any language) aiService: AI service (used only if userPrompt provided) templateName: Name of template style set (None = default) Returns: Dict with style definitions for all document styles """ # Get default style set if templateName == "corporate": defaultStyleSet = self._getCorporateStyleSet() elif templateName == "minimal": defaultStyleSet = self._getMinimalStyleSet() else: defaultStyleSet = self._getDefaultStyleSet() # Enhance with AI if userPrompt provided (AI handles multilingual style detection) if userPrompt and aiService: # AI will naturally detect style instructions in any language self.logger.info(f"Enhancing styles with AI based on user prompt...") enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService) return self._validateStylesContrast(enhancedStyleSet) else: # Use default styles only return defaultStyleSet async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]: """Enhance default styles with AI based on user prompt.""" try: style_template = self._createAiStyleTemplate("docx", userPrompt, defaultStyleSet) enhanced_styles = await self._getAiStyles(aiService, style_template, defaultStyleSet) return enhanced_styles except Exception as e: self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles") return defaultStyleSet def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]: """Validate and fix contrast issues in AI-generated styles.""" try: # Fix table header contrast if "table_header" in styles: header = styles["table_header"] bg_color = header.get("background", "#FFFFFF") text_color = header.get("text_color", "#000000") # If both are white or both are dark, fix it if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": header["background"] = "#4F4F4F" header["text_color"] = "#FFFFFF" elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": header["background"] = "#4F4F4F" header["text_color"] = "#FFFFFF" # Fix table cell contrast if "table_cell" in styles: cell = styles["table_cell"] bg_color = cell.get("background", "#FFFFFF") text_color = cell.get("text_color", "#000000") # If both are white or both are dark, fix it if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": cell["background"] = "#FFFFFF" cell["text_color"] = "#2F2F2F" elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": cell["background"] = "#FFFFFF" cell["text_color"] = "#2F2F2F" return styles except Exception as e: self.logger.warning(f"Style validation failed: {str(e)}") return self._getDefaultStyleSet() def _getDefaultStyleSet(self) -> Dict[str, Any]: """Default DOCX style set - used when no style instructions present.""" return { "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"}, "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"}, "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"}, "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"}, "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"}, "table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"}, "bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20}, "code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"} } def _setupBasicDocumentStyles(self, doc: Document) -> None: """Set up basic document styles.""" try: # Set default font style = doc.styles['Normal'] font = style.font font.name = 'Calibri' font.size = Pt(11) except Exception as e: self.logger.warning(f"Could not set up basic document styles: {str(e)}") def _clearTemplateContent(self, doc: Document) -> None: """Clear template content while preserving styles.""" try: # Remove all paragraphs except keep the styles for paragraph in list(doc.paragraphs): # Keep the paragraph but clear its content paragraph.clear() # Remove all tables for table in list(doc.tables): table._element.getparent().remove(table._element) except Exception as e: self.logger.warning(f"Could not clear template content: {str(e)}") def _renderJsonSection(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a single JSON section to DOCX using AI-generated styles. Supports three content formats: reference, object (base64), extracted_text. """ try: section_type = section.get("content_type", "paragraph") elements = section.get("elements", []) # Process each element in the section for element in elements: element_type = element.get("type", "") # Support three content formats from Phase 5D if element_type == "reference": # Document reference format doc_ref = element.get("documentReference", "") label = element.get("label", "Reference") para = doc.add_paragraph(f"[Reference: {label}]") para.runs[0].italic = True continue elif element_type == "extracted_text": # Extracted text format - render as paragraph content = element.get("content", "") source = element.get("source", "") if content: para = doc.add_paragraph(content) if source: para.add_run(f" (Source: {source})").italic = True continue # Standard section types if section_type == "table": self._renderJsonTable(doc, element, styles) elif section_type == "bullet_list": self._renderJsonBulletList(doc, element, styles) elif section_type == "heading": self._renderJsonHeading(doc, element, styles) elif section_type == "paragraph": self._renderJsonParagraph(doc, element, styles) elif section_type == "code_block": self._renderJsonCodeBlock(doc, element, styles) elif section_type == "image": self._renderJsonImage(doc, element, styles) else: # Fallback to paragraph for unknown types self._renderJsonParagraph(doc, element, styles) except Exception as e: self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}") # Add error paragraph as fallback error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]") def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON table to DOCX using AI-generated styles.""" try: headers = table_data.get("headers", []) rows = table_data.get("rows", []) if not headers or not rows: return # Create table table = doc.add_table(rows=len(rows) + 1, cols=len(headers)) table.alignment = WD_TABLE_ALIGNMENT.CENTER # Apply table borders based on AI style border_style = styles["table_border"]["style"] if border_style == "horizontal_only": self._applyHorizontalBordersOnly(table) elif border_style == "grid": table.style = 'Table Grid' # else: no borders # Add headers with AI-generated styling header_row = table.rows[0] header_style = styles["table_header"] for i, header in enumerate(headers): if i < len(header_row.cells): cell = header_row.cells[i] cell.text = str(header) # Apply background color bg_color = header_style["background"].lstrip('#') self._setCellBackground(cell, RGBColor(int(bg_color[0:2], 16), int(bg_color[2:4], 16), int(bg_color[4:6], 16))) # Apply text styling for paragraph in cell.paragraphs: paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER if header_style["align"] == "center" else WD_ALIGN_PARAGRAPH.LEFT for run in paragraph.runs: run.bold = header_style["bold"] run.font.size = Pt(11) text_color = header_style["text_color"].lstrip('#') run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) # Add data rows with AI-generated styling cell_style = styles["table_cell"] for row_idx, row_data in enumerate(rows): if row_idx + 1 < len(table.rows): table_row = table.rows[row_idx + 1] for col_idx, cell_data in enumerate(row_data): if col_idx < len(table_row.cells): cell = table_row.cells[col_idx] cell.text = str(cell_data) # Apply text styling for paragraph in cell.paragraphs: paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT for run in paragraph.runs: run.font.size = Pt(10) text_color = cell_style["text_color"].lstrip('#') run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) except Exception as e: self.logger.warning(f"Error rendering table: {str(e)}") def _applyHorizontalBordersOnly(self, table) -> None: """Apply only horizontal borders to the table (no vertical borders).""" try: from docx.oxml.shared import OxmlElement, qn # Get table properties tbl_pr = table._element.find(qn('w:tblPr')) if tbl_pr is None: tbl_pr = OxmlElement('w:tblPr') table._element.insert(0, tbl_pr) # Remove existing borders existing_borders = tbl_pr.find(qn('w:tblBorders')) if existing_borders is not None: tbl_pr.remove(existing_borders) # Create new borders element tbl_borders = OxmlElement('w:tblBorders') # Top border top_border = OxmlElement('w:top') top_border.set(qn('w:val'), 'single') top_border.set(qn('w:sz'), '4') top_border.set(qn('w:space'), '0') top_border.set(qn('w:color'), '000000') tbl_borders.append(top_border) # Bottom border bottom_border = OxmlElement('w:bottom') bottom_border.set(qn('w:val'), 'single') bottom_border.set(qn('w:sz'), '4') bottom_border.set(qn('w:space'), '0') bottom_border.set(qn('w:color'), '000000') tbl_borders.append(bottom_border) # Left border - none left_border = OxmlElement('w:left') left_border.set(qn('w:val'), 'none') tbl_borders.append(left_border) # Right border - none right_border = OxmlElement('w:right') right_border.set(qn('w:val'), 'none') tbl_borders.append(right_border) # Inside horizontal border inside_h_border = OxmlElement('w:insideH') inside_h_border.set(qn('w:val'), 'single') inside_h_border.set(qn('w:sz'), '4') inside_h_border.set(qn('w:space'), '0') inside_h_border.set(qn('w:color'), '000000') tbl_borders.append(inside_h_border) # Inside vertical border - none inside_v_border = OxmlElement('w:insideV') inside_v_border.set(qn('w:val'), 'none') tbl_borders.append(inside_v_border) tbl_pr.append(tbl_borders) except Exception as e: self.logger.warning(f"Could not apply horizontal borders: {str(e)}") def _setCellBackground(self, cell, color: RGBColor) -> None: """Set the background color of a table cell.""" try: from docx.oxml.shared import OxmlElement, qn # Get cell properties tc_pr = cell._element.find(qn('w:tcPr')) if tc_pr is None: tc_pr = OxmlElement('w:tcPr') cell._element.insert(0, tc_pr) # Remove existing shading existing_shading = tc_pr.find(qn('w:shd')) if existing_shading is not None: tc_pr.remove(existing_shading) # Create new shading element shading = OxmlElement('w:shd') shading.set(qn('w:val'), 'clear') shading.set(qn('w:color'), 'auto') # Convert RGBColor to hex string by unpacking RGB components red, green, blue = color hex_color = f"{red:02x}{green:02x}{blue:02x}" shading.set(qn('w:fill'), hex_color) tc_pr.append(shading) except Exception as e: self.logger.warning(f"Could not set cell background: {str(e)}") def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON bullet list to DOCX using AI-generated styles.""" try: items = list_data.get("items", []) bullet_style = styles["bullet_list"] for item in items: if isinstance(item, str): para = doc.add_paragraph(item, style='List Bullet') elif isinstance(item, dict) and "text" in item: para = doc.add_paragraph(item["text"], style='List Bullet') except Exception as e: self.logger.warning(f"Error rendering bullet list: {str(e)}") def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON heading to DOCX using AI-generated styles.""" try: level = heading_data.get("level", 1) text = heading_data.get("text", "") if text: level = max(1, min(6, level)) doc.add_heading(text, level=level) except Exception as e: self.logger.warning(f"Error rendering heading: {str(e)}") def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON paragraph to DOCX using AI-generated styles.""" try: text = paragraph_data.get("text", "") if text: para = doc.add_paragraph(text) except Exception as e: self.logger.warning(f"Error rendering paragraph: {str(e)}") def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON code block to DOCX using AI-generated styles.""" try: code = code_data.get("code", "") language = code_data.get("language", "") if code: if language: lang_para = doc.add_paragraph(f"Code ({language}):") lang_para.runs[0].bold = True code_para = doc.add_paragraph(code) for run in code_para.runs: run.font.name = 'Courier New' run.font.size = Pt(10) except Exception as e: self.logger.warning(f"Error rendering code block: {str(e)}") def _renderJsonImage(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON image to DOCX.""" try: base64_data = image_data.get("base64Data", "") alt_text = image_data.get("altText", "Image") if base64_data: image_bytes = base64.b64decode(base64_data) doc.add_picture(io.BytesIO(image_bytes), width=Inches(4)) if alt_text: caption_para = doc.add_paragraph(f"Figure: {alt_text}") caption_para.runs[0].italic = True except Exception as e: self.logger.warning(f"Error rendering image: {str(e)}") doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]") def _extractStructureFromPrompt(self, userPrompt: str, title: str) -> Dict[str, Any]: """Extract document structure from user prompt.""" structure = { 'title': title, 'sections': [], 'format': 'standard' } if not userPrompt: return structure # Extract title from prompt if not provided if not title or title == "Generated Document": # Look for "create a ... document" or "generate a ... report" title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', userPrompt.lower()) if title_match: structure['title'] = title_match.group(1).strip().title() # Extract sections from numbered lists in prompt section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)' sections = re.findall(section_pattern, userPrompt) for num, section_text in sections: structure['sections'].append({ 'number': int(num), 'title': section_text.strip(), 'level': 2 # H2 level }) # If no numbered sections found, try to extract from "including:" patterns if not structure['sections']: including_match = re.search(r'including:\s*(.+?)(?:\.|$)', userPrompt, re.DOTALL) if including_match: including_text = including_match.group(1) # Split by common separators parts = re.split(r'[,;]\s*', including_text) for i, part in enumerate(parts, 1): part = part.strip() if part: structure['sections'].append({ 'number': i, 'title': part, 'level': 2 }) # If still no sections, extract from any list-like patterns if not structure['sections']: # Look for bullet points or dashes bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)' bullets = re.findall(bullet_pattern, userPrompt) for i, bullet in enumerate(bullets, 1): bullet = bullet.strip() if bullet and len(bullet) > 3: structure['sections'].append({ 'number': i, 'title': bullet, 'level': 2 }) # If still no sections, extract from sentence structure if not structure['sections']: # Split prompt into sentences and use as sections sentences = re.split(r'[.!?]\s+', userPrompt) for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections sentence = sentence.strip() if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')): structure['sections'].append({ 'number': i, 'title': sentence[:50] + "..." if len(sentence) > 50 else sentence, 'level': 2 }) # Final fallback: create sections from prompt keywords if not structure['sections']: # Extract key action words from prompt action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate'] found_actions = [] for action in action_words: if action in userPrompt.lower(): found_actions.append(action.title()) if found_actions: for i, action in enumerate(found_actions[:3], 1): structure['sections'].append({ 'number': i, 'title': f"{action} Document Content", 'level': 2 }) else: # Last resort: generic but meaningful sections structure['sections'] = [ {'number': 1, 'title': 'Document Analysis', 'level': 2}, {'number': 2, 'title': 'Key Information', 'level': 2}, {'number': 3, 'title': 'Summary and Conclusions', 'level': 2} ] return structure def _generateFromStructure(self, doc, content: str, structure: Dict[str, Any]): """Generate DOCX content based on extracted structure.""" # Add sections based on prompt structure for section in structure['sections']: # Add section heading doc.add_heading(f"{section['number']}) {section['title']}", level=section['level']) # Add AI-generated content for this section # Try to extract relevant content for this section from the AI response section_content = self._extractSectionContent(content, section['title']) if section_content: doc.add_paragraph(section_content) else: # If no specific content found, add a note doc.add_paragraph(f"Content for {section['title']} based on document analysis.") # Add some spacing doc.add_paragraph() # Add the complete AI-generated content as additional analysis if content and content.strip(): doc.add_heading("Complete Analysis", level=1) doc.add_paragraph(content) def _extractSectionContent(self, content: str, section_title: str) -> str: """Extract relevant content for a specific section from AI response.""" if not content or not section_title: return "" # Look for content that matches the section title section_keywords = section_title.lower().split() # Split content into paragraphs paragraphs = content.split('\n\n') relevant_paragraphs = [] for paragraph in paragraphs: paragraph_lower = paragraph.lower() # Check if paragraph contains keywords from section title if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3): relevant_paragraphs.append(paragraph.strip()) if relevant_paragraphs: return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section return "" def _setupDocumentStyles(self, doc: Document, styleSet: Dict[str, Any]) -> None: """Create all styles in document from style set. Creates styles BEFORE rendering so they're available for use. """ try: from docx.enum.style import WD_STYLE_TYPE # Create Title style if "title" in styleSet: self._createStyle(doc, "Title", styleSet["title"], WD_STYLE_TYPE.PARAGRAPH) # Create Heading styles (Heading 1, Heading 2) if "heading1" in styleSet: self._createStyle(doc, "Heading 1", styleSet["heading1"], WD_STYLE_TYPE.PARAGRAPH) if "heading2" in styleSet: self._createStyle(doc, "Heading 2", styleSet["heading2"], WD_STYLE_TYPE.PARAGRAPH) # Note: List Bullet and List Number are built-in Word styles, no need to create except Exception as e: self.logger.warning(f"Could not set up document styles: {str(e)}") def _createStyle(self, doc: Document, styleName: str, styleConfig: Dict[str, Any], styleType) -> None: """Create or update a style in the document styles collection.""" try: from docx.enum.style import WD_STYLE_TYPE # Try to get existing style, or create new one try: doc_style = doc.styles[styleName] except KeyError: # Create new style based on Normal doc_style = doc.styles.add_style(styleName, styleType) # Base it on Normal style doc_style.base_style = doc.styles['Normal'] # Apply font configuration font = doc_style.font if "font_size" in styleConfig: font.size = Pt(styleConfig["font_size"]) if "bold" in styleConfig: font.bold = styleConfig["bold"] if "color" in styleConfig: color_hex = styleConfig["color"].lstrip('#') font.color.rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16)) if "font" in styleConfig: font.name = styleConfig["font"] # Set paragraph formatting for alignment if "align" in styleConfig: para_format = doc_style.paragraph_format align = styleConfig["align"] if align == "center": para_format.alignment = WD_ALIGN_PARAGRAPH.CENTER elif align == "right": para_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT else: para_format.alignment = WD_ALIGN_PARAGRAPH.LEFT except Exception as e: self.logger.warning(f"Could not create style '{styleName}': {str(e)}") def _processSection(self, doc, lines: list): """Process a section of content into DOCX elements.""" for line in lines: if not line.strip(): continue # Check for tables (lines with |) if '|' in line and not line.startswith('|'): # This might be part of a table, process as table table_data = self._extractTableData(lines) if table_data: self._addTable(doc, table_data) return # Check for lists if line.startswith('- ') or line.startswith('* '): # This is a list item doc.add_paragraph(line[2:], style='List Bullet') elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')): # This is a numbered list item doc.add_paragraph(line[3:], style='List Number') else: # Regular paragraph doc.add_paragraph(line) def _extractTableData(self, lines: list) -> list: """Extract table data from lines.""" table_data = [] in_table = False for line in lines: if '|' in line: if not in_table: in_table = True # Split by | and clean up cells = [cell.strip() for cell in line.split('|') if cell.strip()] if cells: table_data.append(cells) elif in_table and not line.strip(): # Empty line, might be end of table break return table_data if len(table_data) > 1 else [] def _addTable(self, doc, table_data: list): """Add a table to the document.""" try: if not table_data: return # Create table table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) table.alignment = WD_TABLE_ALIGNMENT.CENTER # Add data to table for row_idx, row_data in enumerate(table_data): for col_idx, cell_data in enumerate(row_data): if col_idx < len(table.rows[row_idx].cells): table.rows[row_idx].cells[col_idx].text = cell_data # Style the table self._styleTable(table) except Exception as e: self.logger.warning(f"Could not add table: {str(e)}") def _styleTable(self, table): """Apply styling to the table.""" try: # Style header row if len(table.rows) > 0: header_cells = table.rows[0].cells for cell in header_cells: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True except Exception as e: self.logger.warning(f"Could not style table: {str(e)}") def _processTableRow(self, doc, line: str): """Process a table row and add it to the document.""" if not line.strip(): return # Split by pipe separator parts = [part.strip() for part in line.split('|')] if len(parts) >= 2: # This is a table row - create a table if it doesn't exist if not hasattr(self, '_current_table') or self._current_table is None: # Create new table self._current_table = doc.add_table(rows=1, cols=len(parts)) self._current_table.style = 'Table Grid' # Add header row for i, part in enumerate(parts): if i < len(self._current_table.rows[0].cells): cell = self._current_table.rows[0].cells[i] cell.text = part # Make header bold for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True else: # Add data row to existing table row = self._current_table.add_row() for i, part in enumerate(parts): if i < len(row.cells): row.cells[i].text = part else: # Not a table row, treat as regular text doc.add_paragraph(line) def _cleanAiContent(self, content: str) -> str: """Clean AI-generated content by removing debug information and duplicates.""" if not content: return "" # Remove debug information lines = content.split('\n') clean_lines = [] for line in lines: # Skip debug lines and separators if (line.startswith('[Skipped ') or line.startswith('=== DOCUMENT:') or line.startswith('---') or line.startswith('FILENAME:') or line.strip() == '' or line.strip() == '---'): continue clean_lines.append(line) # Join lines and remove duplicate content clean_content = '\n'.join(clean_lines) # Remove duplicate sections by keeping only the first occurrence sections = clean_content.split('\n\n') seen_sections = set() unique_sections = [] for section in sections: section_key = section.strip()[:50] # Use first 50 chars as key if section_key not in seen_sections and section.strip(): seen_sections.add(section_key) unique_sections.append(section) return '\n\n'.join(unique_sections) def _processTables(self, doc, content: str) -> str: """ Process tables in the content (both CSV and pipe-separated) and convert them to Word tables. Returns the content with tables replaced by placeholders. """ # csv is already imported at module level lines = content.split('\n') processed_lines = [] i = 0 while i < len(lines): line = lines[i].strip() # Check if this line looks like a table (contains pipes or commas with multiple fields) is_pipe_table = '|' in line and len(line.split('|')) >= 2 is_csv_table = ',' in line and len(line.split(',')) >= 2 if is_pipe_table or is_csv_table: # Collect consecutive table lines table_lines = [] j = i # Determine separator and collect lines separator = '|' if is_pipe_table else ',' while j < len(lines): current_line = lines[j].strip() if separator in current_line and len(current_line.split(separator)) >= 2: table_lines.append(current_line) j += 1 else: break if len(table_lines) >= 2: # At least header + 1 data row # Create Word table try: if separator == '|': # Process pipe-separated table rows = [] for table_line in table_lines: # Split by pipe and clean up cells = [cell.strip() for cell in table_line.split('|')] rows.append(cells) else: # Process CSV table csv_content = '\n'.join(table_lines) csv_reader = csv.reader(io.StringIO(csv_content)) rows = list(csv_reader) if rows and len(rows[0]) > 0: # Create Word table table = doc.add_table(rows=len(rows), cols=len(rows[0])) table.style = 'Table Grid' # Populate table for row_idx, row_data in enumerate(rows): for col_idx, cell_data in enumerate(row_data): if col_idx < len(table.rows[row_idx].cells): table.rows[row_idx].cells[col_idx].text = cell_data.strip() # Make header row bold if row_idx == 0: for cell in table.rows[row_idx].cells: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True # Add placeholder to mark where table was inserted processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]") # Skip the table lines i = j continue except Exception as e: # If table parsing fails, treat as regular text pass processed_lines.append(line) i += 1 return '\n'.join(processed_lines) def _parseAndFormatContent(self, doc, content: str, title: str): """Parse AI-generated content in standardized format and apply proper DOCX formatting.""" if not content: return # Process tables and replace them with placeholders content = self._processTables(doc, content) # Parse content line by line in exact sequence lines = content.split('\n') for line in lines: line = line.strip() if not line: # Empty line - add paragraph break doc.add_paragraph() continue # Skip table placeholders (already processed) if line.startswith('[TABLE_INSERTED_'): continue # Check if this is a Markdown heading (# ## ###) if line.startswith('#'): level = len(line) - len(line.lstrip('#')) heading_text = line.lstrip('# ').strip() doc.add_heading(heading_text, level=min(level, 3)) # Check if this is a numbered heading (1) Title, 2) Title, etc.) elif re.match(r'^\d+\)\s+.+', line): heading_text = re.sub(r'^\d+\)\s+', '', line) doc.add_heading(heading_text, level=1) # Check if this is a Markdown list item elif line.startswith('- ') or re.match(r'^\d+\.\s+', line): bullet_text = re.sub(r'^[-•]\s+|\d+\.\s+', '', line) self._add_bullet_point(doc, bullet_text) # Check if this is a code block elif line.startswith('```'): if not line.endswith('```'): # Start of code block - collect until end code_lines = [line] continue else: # End of code block if 'code_lines' in locals(): code_lines.append(line) code_text = '\n'.join(code_lines) para = doc.add_paragraph() run = para.add_run(code_text) run.font.name = 'Courier New' del code_lines # Regular paragraph else: self._addParagraphToDoc(doc, line) def _addParagraphToDoc(self, doc, text: str): """Add a paragraph to the document with proper formatting.""" if not text.strip(): return # Check for Markdown formatting (**bold**, *italic*) para = doc.add_paragraph() # Split by bold markers parts = text.split('**') for i, part in enumerate(parts): if i % 2 == 0: # Regular text - check for italic italic_parts = part.split('*') for j, italic_part in enumerate(italic_parts): if j % 2 == 0: # Regular text if italic_part: para.add_run(italic_part) else: # Italic text if italic_part: run = para.add_run(italic_part) run.italic = True else: # Bold text if part: run = para.add_run(part) run.bold = True