""" DOCX renderer for report generation using python-docx. """ from .rendererBaseTemplate import BaseRenderer from typing import Dict, Any, Tuple, List import io import base64 import re import os from datetime import datetime, UTC try: from docx import Document from docx.shared import Inches, Pt, RGBColor from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_TABLE_ALIGNMENT from docx.oxml.shared import OxmlElement, qn from docx.oxml.ns import nsdecls from docx.oxml import parse_xml DOCX_AVAILABLE = True except ImportError: DOCX_AVAILABLE = False class RendererDocx(BaseRenderer): """Renders content to DOCX format using python-docx.""" @classmethod def get_supported_formats(cls) -> List[str]: """Return supported DOCX formats.""" return ['docx', 'doc'] @classmethod def get_format_aliases(cls) -> List[str]: """Return format aliases.""" return ['word', 'document'] @classmethod def get_priority(cls) -> int: """Return priority for DOCX renderer.""" return 115 async def render(self, extracted_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> Tuple[str, str]: """Render extracted JSON content to DOCX format using AI-analyzed styling.""" print(f"🔍 DOCX RENDER CALLED: title={title}, user_prompt={user_prompt[:50] if user_prompt else 'None'}...") try: if not DOCX_AVAILABLE: # Fallback to HTML if python-docx not available from .rendererHtml import RendererHtml html_renderer = RendererHtml() html_content, _ = await html_renderer.render(extracted_content, title) return html_content, "text/html" # Generate DOCX using AI-analyzed styling docx_content = await self._generate_docx_from_json(extracted_content, title, user_prompt, ai_service) return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" except Exception as e: self.logger.error(f"Error rendering DOCX: {str(e)}") # Return minimal fallback return f"DOCX Generation Error: {str(e)}", "text/plain" async def _generate_docx_from_json(self, json_content: Dict[str, Any], title: str, user_prompt: str = None, ai_service=None) -> str: """Generate DOCX content from structured JSON document using AI-generated styling.""" try: # Create new document doc = Document() # Get AI-generated styling definitions print(f"🔍 ABOUT TO CALL AI STYLING: user_prompt={user_prompt[:50] if user_prompt else 'None'}...") self.logger.info(f"About to call AI styling with user_prompt: {user_prompt[:100] if user_prompt else 'None'}...") styles = await self._get_docx_styles(user_prompt, ai_service) print(f"🔍 AI STYLING RESULT: {type(styles)}") # Apply basic document setup self._setup_basic_document_styles(doc) # Validate JSON structure if not isinstance(json_content, dict): raise ValueError("JSON content must be a dictionary") if "sections" not in json_content: raise ValueError("JSON content must contain 'sections' field") # Use title from JSON metadata if available, otherwise use provided title document_title = json_content.get("metadata", {}).get("title", title) # Add document title using analyzed styles if document_title: title_heading = doc.add_heading(document_title, level=1) title_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER # Process each section in order sections = json_content.get("sections", []) for section in sections: self._render_json_section(doc, section, styles) # Save to buffer buffer = io.BytesIO() doc.save(buffer) buffer.seek(0) # Convert to base64 docx_bytes = buffer.getvalue() docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') return docx_base64 except Exception as e: self.logger.error(f"Error generating DOCX from JSON: {str(e)}") raise Exception(f"DOCX generation failed: {str(e)}") async def _get_docx_styles(self, user_prompt: str, ai_service=None) -> Dict[str, Any]: """Get DOCX styling definitions using base template AI styling.""" style_schema = { "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"}, "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"}, "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"}, "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"}, "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"}, "table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"}, "bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20}, "code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"} } style_template = self._create_ai_style_template("docx", user_prompt, style_schema) styles = await self._get_ai_styles(ai_service, style_template, self._get_default_styles()) # Validate and fix contrast issues return self._validate_styles_contrast(styles) def _validate_styles_contrast(self, styles: Dict[str, Any]) -> Dict[str, Any]: """Validate and fix contrast issues in AI-generated styles.""" try: # Fix table header contrast if "table_header" in styles: header = styles["table_header"] bg_color = header.get("background", "#FFFFFF") text_color = header.get("text_color", "#000000") # If both are white or both are dark, fix it if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": header["background"] = "#4F4F4F" header["text_color"] = "#FFFFFF" elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": header["background"] = "#4F4F4F" header["text_color"] = "#FFFFFF" # Fix table cell contrast if "table_cell" in styles: cell = styles["table_cell"] bg_color = cell.get("background", "#FFFFFF") text_color = cell.get("text_color", "#000000") # If both are white or both are dark, fix it if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF": cell["background"] = "#FFFFFF" cell["text_color"] = "#2F2F2F" elif bg_color.upper() == "#000000" and text_color.upper() == "#000000": cell["background"] = "#FFFFFF" cell["text_color"] = "#2F2F2F" return styles except Exception as e: self.logger.warning(f"Style validation failed: {str(e)}") return self._get_default_styles() def _get_default_styles(self) -> Dict[str, Any]: """Default DOCX styles.""" return { "title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"}, "heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"}, "heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"}, "paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"}, "table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"}, "table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"}, "table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"}, "bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20}, "code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"} } def _setup_basic_document_styles(self, doc: Document) -> None: """Set up basic document styles.""" try: # Set default font style = doc.styles['Normal'] font = style.font font.name = 'Calibri' font.size = Pt(11) except Exception as e: self.logger.warning(f"Could not set up basic document styles: {str(e)}") def _clear_template_content(self, doc: Document) -> None: """Clear template content while preserving styles.""" try: # Remove all paragraphs except keep the styles for paragraph in list(doc.paragraphs): # Keep the paragraph but clear its content paragraph.clear() # Remove all tables for table in list(doc.tables): table._element.getparent().remove(table._element) except Exception as e: self.logger.warning(f"Could not clear template content: {str(e)}") def _render_json_section(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a single JSON section to DOCX using AI-generated styles.""" try: section_type = section.get("type", "paragraph") section_data = section.get("data", {}) if section_type == "table": self._render_json_table(doc, section_data, styles) elif section_type == "bullet_list": self._render_json_bullet_list(doc, section_data, styles) elif section_type == "heading": self._render_json_heading(doc, section_data, styles) elif section_type == "paragraph": self._render_json_paragraph(doc, section_data, styles) elif section_type == "code_block": self._render_json_code_block(doc, section_data, styles) elif section_type == "image": self._render_json_image(doc, section_data, styles) else: # Fallback to paragraph for unknown types self._render_json_paragraph(doc, section_data, styles) except Exception as e: self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}") # Add error paragraph as fallback error_para = doc.add_paragraph(f"[Error rendering section: {str(e)}]") def _render_json_table(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON table to DOCX using AI-generated styles.""" try: headers = table_data.get("headers", []) rows = table_data.get("rows", []) if not headers or not rows: return # Create table table = doc.add_table(rows=len(rows) + 1, cols=len(headers)) table.alignment = WD_TABLE_ALIGNMENT.CENTER # Apply table borders based on AI style border_style = styles["table_border"]["style"] if border_style == "horizontal_only": self._apply_horizontal_borders_only(table) elif border_style == "grid": table.style = 'Table Grid' # else: no borders # Add headers with AI-generated styling header_row = table.rows[0] header_style = styles["table_header"] for i, header in enumerate(headers): if i < len(header_row.cells): cell = header_row.cells[i] cell.text = str(header) # Apply background color bg_color = header_style["background"].lstrip('#') self._set_cell_background(cell, RGBColor(int(bg_color[0:2], 16), int(bg_color[2:4], 16), int(bg_color[4:6], 16))) # Apply text styling for paragraph in cell.paragraphs: paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER if header_style["align"] == "center" else WD_ALIGN_PARAGRAPH.LEFT for run in paragraph.runs: run.bold = header_style["bold"] run.font.size = Pt(11) text_color = header_style["text_color"].lstrip('#') run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) # Add data rows with AI-generated styling cell_style = styles["table_cell"] for row_idx, row_data in enumerate(rows): if row_idx + 1 < len(table.rows): table_row = table.rows[row_idx + 1] for col_idx, cell_data in enumerate(row_data): if col_idx < len(table_row.cells): cell = table_row.cells[col_idx] cell.text = str(cell_data) # Apply text styling for paragraph in cell.paragraphs: paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT for run in paragraph.runs: run.font.size = Pt(10) text_color = cell_style["text_color"].lstrip('#') run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) except Exception as e: self.logger.warning(f"Error rendering table: {str(e)}") def _apply_horizontal_borders_only(self, table) -> None: """Apply only horizontal borders to the table (no vertical borders).""" try: from docx.oxml.shared import OxmlElement, qn # Get table properties tbl_pr = table._element.find(qn('w:tblPr')) if tbl_pr is None: tbl_pr = OxmlElement('w:tblPr') table._element.insert(0, tbl_pr) # Remove existing borders existing_borders = tbl_pr.find(qn('w:tblBorders')) if existing_borders is not None: tbl_pr.remove(existing_borders) # Create new borders element tbl_borders = OxmlElement('w:tblBorders') # Top border top_border = OxmlElement('w:top') top_border.set(qn('w:val'), 'single') top_border.set(qn('w:sz'), '4') top_border.set(qn('w:space'), '0') top_border.set(qn('w:color'), '000000') tbl_borders.append(top_border) # Bottom border bottom_border = OxmlElement('w:bottom') bottom_border.set(qn('w:val'), 'single') bottom_border.set(qn('w:sz'), '4') bottom_border.set(qn('w:space'), '0') bottom_border.set(qn('w:color'), '000000') tbl_borders.append(bottom_border) # Left border - none left_border = OxmlElement('w:left') left_border.set(qn('w:val'), 'none') tbl_borders.append(left_border) # Right border - none right_border = OxmlElement('w:right') right_border.set(qn('w:val'), 'none') tbl_borders.append(right_border) # Inside horizontal border inside_h_border = OxmlElement('w:insideH') inside_h_border.set(qn('w:val'), 'single') inside_h_border.set(qn('w:sz'), '4') inside_h_border.set(qn('w:space'), '0') inside_h_border.set(qn('w:color'), '000000') tbl_borders.append(inside_h_border) # Inside vertical border - none inside_v_border = OxmlElement('w:insideV') inside_v_border.set(qn('w:val'), 'none') tbl_borders.append(inside_v_border) tbl_pr.append(tbl_borders) except Exception as e: self.logger.warning(f"Could not apply horizontal borders: {str(e)}") def _set_cell_background(self, cell, color: RGBColor) -> None: """Set the background color of a table cell.""" try: from docx.oxml.shared import OxmlElement, qn # Get cell properties tc_pr = cell._element.find(qn('w:tcPr')) if tc_pr is None: tc_pr = OxmlElement('w:tcPr') cell._element.insert(0, tc_pr) # Remove existing shading existing_shading = tc_pr.find(qn('w:shd')) if existing_shading is not None: tc_pr.remove(existing_shading) # Create new shading element shading = OxmlElement('w:shd') shading.set(qn('w:val'), 'clear') shading.set(qn('w:color'), 'auto') # Convert RGBColor to hex string by unpacking RGB components red, green, blue = color hex_color = f"{red:02x}{green:02x}{blue:02x}" shading.set(qn('w:fill'), hex_color) tc_pr.append(shading) except Exception as e: self.logger.warning(f"Could not set cell background: {str(e)}") def _render_json_bullet_list(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON bullet list to DOCX using AI-generated styles.""" try: items = list_data.get("items", []) bullet_style = styles["bullet_list"] for item in items: if isinstance(item, str): para = doc.add_paragraph(item, style='List Bullet') elif isinstance(item, dict) and "text" in item: para = doc.add_paragraph(item["text"], style='List Bullet') except Exception as e: self.logger.warning(f"Error rendering bullet list: {str(e)}") def _render_json_heading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON heading to DOCX using AI-generated styles.""" try: level = heading_data.get("level", 1) text = heading_data.get("text", "") if text: level = max(1, min(6, level)) doc.add_heading(text, level=level) except Exception as e: self.logger.warning(f"Error rendering heading: {str(e)}") def _render_json_paragraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON paragraph to DOCX using AI-generated styles.""" try: text = paragraph_data.get("text", "") if text: para = doc.add_paragraph(text) except Exception as e: self.logger.warning(f"Error rendering paragraph: {str(e)}") def _render_json_code_block(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON code block to DOCX using AI-generated styles.""" try: code = code_data.get("code", "") language = code_data.get("language", "") if code: if language: lang_para = doc.add_paragraph(f"Code ({language}):") lang_para.runs[0].bold = True code_para = doc.add_paragraph(code) for run in code_para.runs: run.font.name = 'Courier New' run.font.size = Pt(10) except Exception as e: self.logger.warning(f"Error rendering code block: {str(e)}") def _render_json_image(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None: """Render a JSON image to DOCX.""" try: base64_data = image_data.get("base64Data", "") alt_text = image_data.get("altText", "Image") if base64_data: image_bytes = base64.b64decode(base64_data) doc.add_picture(io.BytesIO(image_bytes), width=Inches(4)) if alt_text: caption_para = doc.add_paragraph(f"Figure: {alt_text}") caption_para.runs[0].italic = True except Exception as e: self.logger.warning(f"Error rendering image: {str(e)}") doc.add_paragraph(f"[Image: {image_data.get('altText', 'Image')}]") def _extract_structure_from_prompt(self, user_prompt: str, title: str) -> Dict[str, Any]: """Extract document structure from user prompt.""" structure = { 'title': title, 'sections': [], 'format': 'standard' } if not user_prompt: return structure # Extract title from prompt if not provided if not title or title == "Generated Document": # Look for "create a ... document" or "generate a ... report" import re title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', user_prompt.lower()) if title_match: structure['title'] = title_match.group(1).strip().title() # Extract sections from numbered lists in prompt import re section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)' sections = re.findall(section_pattern, user_prompt) for num, section_text in sections: structure['sections'].append({ 'number': int(num), 'title': section_text.strip(), 'level': 2 # H2 level }) # If no numbered sections found, try to extract from "including:" patterns if not structure['sections']: including_match = re.search(r'including:\s*(.+?)(?:\.|$)', user_prompt, re.DOTALL) if including_match: including_text = including_match.group(1) # Split by common separators parts = re.split(r'[,;]\s*', including_text) for i, part in enumerate(parts, 1): part = part.strip() if part: structure['sections'].append({ 'number': i, 'title': part, 'level': 2 }) # If still no sections, extract from any list-like patterns if not structure['sections']: # Look for bullet points or dashes bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)' bullets = re.findall(bullet_pattern, user_prompt) for i, bullet in enumerate(bullets, 1): bullet = bullet.strip() if bullet and len(bullet) > 3: structure['sections'].append({ 'number': i, 'title': bullet, 'level': 2 }) # If still no sections, extract from sentence structure if not structure['sections']: # Split prompt into sentences and use as sections sentences = re.split(r'[.!?]\s+', user_prompt) for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections sentence = sentence.strip() if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')): structure['sections'].append({ 'number': i, 'title': sentence[:50] + "..." if len(sentence) > 50 else sentence, 'level': 2 }) # Final fallback: create sections from prompt keywords if not structure['sections']: # Extract key action words from prompt action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate'] found_actions = [] for action in action_words: if action in user_prompt.lower(): found_actions.append(action.title()) if found_actions: for i, action in enumerate(found_actions[:3], 1): structure['sections'].append({ 'number': i, 'title': f"{action} Document Content", 'level': 2 }) else: # Last resort: generic but meaningful sections structure['sections'] = [ {'number': 1, 'title': 'Document Analysis', 'level': 2}, {'number': 2, 'title': 'Key Information', 'level': 2}, {'number': 3, 'title': 'Summary and Conclusions', 'level': 2} ] return structure def _generate_content_from_structure(self, doc, content: str, structure: Dict[str, Any]): """Generate DOCX content based on extracted structure.""" # Add sections based on prompt structure for section in structure['sections']: # Add section heading doc.add_heading(f"{section['number']}) {section['title']}", level=section['level']) # Add AI-generated content for this section # Try to extract relevant content for this section from the AI response section_content = self._extract_section_content(content, section['title']) if section_content: doc.add_paragraph(section_content) else: # If no specific content found, add a note doc.add_paragraph(f"Content for {section['title']} based on document analysis.") # Add some spacing doc.add_paragraph() # Add the complete AI-generated content as additional analysis if content and content.strip(): doc.add_heading("Complete Analysis", level=1) doc.add_paragraph(content) def _extract_section_content(self, content: str, section_title: str) -> str: """Extract relevant content for a specific section from AI response.""" if not content or not section_title: return "" # Look for content that matches the section title section_keywords = section_title.lower().split() # Split content into paragraphs paragraphs = content.split('\n\n') relevant_paragraphs = [] for paragraph in paragraphs: paragraph_lower = paragraph.lower() # Check if paragraph contains keywords from section title if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3): relevant_paragraphs.append(paragraph.strip()) if relevant_paragraphs: return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section return "" def _setup_document_styles(self, doc): """Set up document styles.""" try: # Set default font style = doc.styles['Normal'] font = style.font font.name = 'Calibri' font.size = Pt(11) # Set heading styles for i in range(1, 4): heading_style = doc.styles[f'Heading {i}'] heading_font = heading_style.font heading_font.name = 'Calibri' heading_font.size = Pt(16 - i * 2) heading_font.bold = True except Exception as e: self.logger.warning(f"Could not set up document styles: {str(e)}") def _process_section(self, doc, lines: list): """Process a section of content into DOCX elements.""" for line in lines: if not line.strip(): continue # Check for tables (lines with |) if '|' in line and not line.startswith('|'): # This might be part of a table, process as table table_data = self._extract_table_data(lines) if table_data: self._add_table(doc, table_data) return # Check for lists if line.startswith('- ') or line.startswith('* '): # This is a list item doc.add_paragraph(line[2:], style='List Bullet') elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')): # This is a numbered list item doc.add_paragraph(line[3:], style='List Number') else: # Regular paragraph doc.add_paragraph(line) def _extract_table_data(self, lines: list) -> list: """Extract table data from lines.""" table_data = [] in_table = False for line in lines: if '|' in line: if not in_table: in_table = True # Split by | and clean up cells = [cell.strip() for cell in line.split('|') if cell.strip()] if cells: table_data.append(cells) elif in_table and not line.strip(): # Empty line, might be end of table break return table_data if len(table_data) > 1 else [] def _add_table(self, doc, table_data: list): """Add a table to the document.""" try: if not table_data: return # Create table table = doc.add_table(rows=len(table_data), cols=len(table_data[0])) table.alignment = WD_TABLE_ALIGNMENT.CENTER # Add data to table for row_idx, row_data in enumerate(table_data): for col_idx, cell_data in enumerate(row_data): if col_idx < len(table.rows[row_idx].cells): table.rows[row_idx].cells[col_idx].text = cell_data # Style the table self._style_table(table) except Exception as e: self.logger.warning(f"Could not add table: {str(e)}") def _style_table(self, table): """Apply styling to the table.""" try: # Style header row if len(table.rows) > 0: header_cells = table.rows[0].cells for cell in header_cells: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True except Exception as e: self.logger.warning(f"Could not style table: {str(e)}") def _process_table_row(self, doc, line: str): """Process a table row and add it to the document.""" if not line.strip(): return # Split by pipe separator parts = [part.strip() for part in line.split('|')] if len(parts) >= 2: # This is a table row - create a table if it doesn't exist if not hasattr(self, '_current_table') or self._current_table is None: # Create new table self._current_table = doc.add_table(rows=1, cols=len(parts)) self._current_table.style = 'Table Grid' # Add header row for i, part in enumerate(parts): if i < len(self._current_table.rows[0].cells): cell = self._current_table.rows[0].cells[i] cell.text = part # Make header bold for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True else: # Add data row to existing table row = self._current_table.add_row() for i, part in enumerate(parts): if i < len(row.cells): row.cells[i].text = part else: # Not a table row, treat as regular text doc.add_paragraph(line) def _clean_ai_content(self, content: str) -> str: """Clean AI-generated content by removing debug information and duplicates.""" if not content: return "" # Remove debug information lines = content.split('\n') clean_lines = [] for line in lines: # Skip debug lines and separators if (line.startswith('[Skipped ') or line.startswith('=== DOCUMENT:') or line.startswith('---') or line.startswith('FILENAME:') or line.strip() == '' or line.strip() == '---'): continue clean_lines.append(line) # Join lines and remove duplicate content clean_content = '\n'.join(clean_lines) # Remove duplicate sections by keeping only the first occurrence sections = clean_content.split('\n\n') seen_sections = set() unique_sections = [] for section in sections: section_key = section.strip()[:50] # Use first 50 chars as key if section_key not in seen_sections and section.strip(): seen_sections.add(section_key) unique_sections.append(section) return '\n\n'.join(unique_sections) def _process_tables(self, doc, content: str) -> str: """ Process tables in the content (both CSV and pipe-separated) and convert them to Word tables. Returns the content with tables replaced by placeholders. """ import csv import io lines = content.split('\n') processed_lines = [] i = 0 while i < len(lines): line = lines[i].strip() # Check if this line looks like a table (contains pipes or commas with multiple fields) is_pipe_table = '|' in line and len(line.split('|')) >= 2 is_csv_table = ',' in line and len(line.split(',')) >= 2 if is_pipe_table or is_csv_table: # Collect consecutive table lines table_lines = [] j = i # Determine separator and collect lines separator = '|' if is_pipe_table else ',' while j < len(lines): current_line = lines[j].strip() if separator in current_line and len(current_line.split(separator)) >= 2: table_lines.append(current_line) j += 1 else: break if len(table_lines) >= 2: # At least header + 1 data row # Create Word table try: if separator == '|': # Process pipe-separated table rows = [] for table_line in table_lines: # Split by pipe and clean up cells = [cell.strip() for cell in table_line.split('|')] rows.append(cells) else: # Process CSV table csv_content = '\n'.join(table_lines) csv_reader = csv.reader(io.StringIO(csv_content)) rows = list(csv_reader) if rows and len(rows[0]) > 0: # Create Word table table = doc.add_table(rows=len(rows), cols=len(rows[0])) table.style = 'Table Grid' # Populate table for row_idx, row_data in enumerate(rows): for col_idx, cell_data in enumerate(row_data): if col_idx < len(table.rows[row_idx].cells): table.rows[row_idx].cells[col_idx].text = cell_data.strip() # Make header row bold if row_idx == 0: for cell in table.rows[row_idx].cells: for paragraph in cell.paragraphs: for run in paragraph.runs: run.bold = True # Add placeholder to mark where table was inserted processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]") # Skip the table lines i = j continue except Exception as e: # If table parsing fails, treat as regular text pass processed_lines.append(line) i += 1 return '\n'.join(processed_lines) def _parse_and_format_content(self, doc, content: str, title: str): """Parse AI-generated content in standardized format and apply proper DOCX formatting.""" if not content: return # Process tables and replace them with placeholders content = self._process_tables(doc, content) # Parse content line by line in exact sequence lines = content.split('\n') for line in lines: line = line.strip() if not line: # Empty line - add paragraph break doc.add_paragraph() continue # Skip table placeholders (already processed) if line.startswith('[TABLE_INSERTED_'): continue # Check if this is a Markdown heading (# ## ###) if line.startswith('#'): level = len(line) - len(line.lstrip('#')) heading_text = line.lstrip('# ').strip() doc.add_heading(heading_text, level=min(level, 3)) # Check if this is a numbered heading (1) Title, 2) Title, etc.) elif re.match(r'^\d+\)\s+.+', line): heading_text = re.sub(r'^\d+\)\s+', '', line) doc.add_heading(heading_text, level=1) # Check if this is a Markdown list item elif line.startswith('- ') or re.match(r'^\d+\.\s+', line): bullet_text = re.sub(r'^[-•]\s+|\d+\.\s+', '', line) self._add_bullet_point(doc, bullet_text) # Check if this is a code block elif line.startswith('```'): if not line.endswith('```'): # Start of code block - collect until end code_lines = [line] continue else: # End of code block if 'code_lines' in locals(): code_lines.append(line) code_text = '\n'.join(code_lines) para = doc.add_paragraph() run = para.add_run(code_text) run.font.name = 'Courier New' del code_lines # Regular paragraph else: self._add_paragraph_to_doc(doc, line) def _add_paragraph_to_doc(self, doc, text: str): """Add a paragraph to the document with proper formatting.""" if not text.strip(): return # Check for Markdown formatting (**bold**, *italic*) para = doc.add_paragraph() # Split by bold markers parts = text.split('**') for i, part in enumerate(parts): if i % 2 == 0: # Regular text - check for italic italic_parts = part.split('*') for j, italic_part in enumerate(italic_parts): if j % 2 == 0: # Regular text if italic_part: para.add_run(italic_part) else: # Italic text if italic_part: run = para.add_run(italic_part) run.italic = True else: # Bold text if part: run = para.add_run(part) run.bold = True