gateway/modules/services/serviceGeneration/renderers/rendererDocx.py
2026-01-23 01:10:00 +01:00

1609 lines
No EOL
72 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
DOCX renderer for report generation using python-docx.
"""
from .documentRendererBaseTemplate import BaseRenderer
from modules.datamodels.datamodelDocument import RenderedDocument
from typing import Dict, Any, List, Optional
import io
import base64
import re
import csv
# python-docx is an optional dependency; DOCX_AVAILABLE records whether it loaded.
try:
    from docx import Document
    from docx.shared import Inches, Pt, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.enum.table import WD_TABLE_ALIGNMENT
    DOCX_AVAILABLE = True
except ImportError:
    # Several methods below use these names in their signatures (e.g.
    # ``doc: Document``, ``color: RGBColor``). Annotations are evaluated when
    # the class body executes, so the names must exist even without
    # python-docx; otherwise importing this module raises NameError and the
    # HTML fallback in render() can never be reached.
    Document = Inches = Pt = RGBColor = None
    WD_ALIGN_PARAGRAPH = WD_TABLE_ALIGNMENT = None
    DOCX_AVAILABLE = False
class RendererDocx(BaseRenderer):
"""Renders content to DOCX format using python-docx."""
@classmethod
def getSupportedFormats(cls) -> List[str]:
"""Return supported DOCX formats."""
return ['docx', 'doc']
@classmethod
def getFormatAliases(cls) -> List[str]:
"""Return format aliases."""
return ['word', 'document']
@classmethod
def getPriority(cls) -> int:
"""Return priority for DOCX renderer."""
return 115
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
"""Return output style classification: Word documents are formatted documents."""
return 'document'
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
    """
    List the section content types the DOCX renderer accepts.

    Every type in ``supportedSectionTypes`` is accepted; a fresh list is
    returned on each call.

    Args:
        formatName: Accepted for interface compatibility; not used.
    """
    # Imported at call time rather than at module import.
    from modules.datamodels.datamodelJson import supportedSectionTypes
    acceptedTypes = [sectionType for sectionType in supportedSectionTypes]
    return acceptedTypes
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]:
    """Render extracted JSON content into a single DOCX document.

    Args:
        extractedContent: Structured content with "metadata" and "documents".
        title: Document title; also used to derive a filename when none is given.
        userPrompt: Optional user prompt forwarded to style selection.
        aiService: Optional AI service forwarded to style selection.

    Returns:
        A one-element list. On success it holds the DOCX bytes; on any failure
        it holds a plain-text document describing the error. When python-docx
        is unavailable the HTML renderer's result is returned instead.
    """
    promptPreview = userPrompt[:50] if userPrompt else 'None'
    self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={promptPreview}...", "DOCX_RENDERER")
    docxMimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    try:
        if not DOCX_AVAILABLE:
            # python-docx missing: delegate the whole job to the HTML renderer
            from .rendererHtml import RendererHtml
            return await RendererHtml().render(extractedContent, title, userPrompt, aiService)

        generated = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)

        metadata = extractedContent.get("metadata", {}) if extractedContent else {}
        metadataIsDict = isinstance(metadata, dict)
        documentType = metadata.get("documentType") if metadataIsDict else None

        # Prefer the filename of the first document entry, else derive one from the title
        filename = None
        documents = extractedContent.get("documents", [])
        if documents and isinstance(documents[0], dict):
            filename = documents[0].get("filename")
        if not filename:
            filename = self._determineFilename(title, docxMimeType)

        # The generator hands back base64 text; turn it into raw bytes
        if isinstance(generated, str):
            try:
                payload = base64.b64decode(generated)
            except Exception:
                payload = generated.encode('utf-8')
        else:
            payload = generated

        document = RenderedDocument(
            documentData=payload,
            mimeType=docxMimeType,
            filename=filename,
            documentType=documentType,
            metadata=metadata if metadataIsDict else None
        )
        return [document]
    except Exception as e:
        self.logger.error(f"Error rendering DOCX: {str(e)}")
        # Minimal fallback: a text document carrying the error message
        errorText = f"DOCX Generation Error: {str(e)}"
        metadata = extractedContent.get("metadata", {}) if extractedContent else {}
        metadataIsDict = isinstance(metadata, dict)
        fallback = RenderedDocument(
            documentData=errorText.encode('utf-8'),
            mimeType="text/plain",
            filename=self._determineFilename(title, "text/plain"),
            documentType=metadata.get("documentType") if metadataIsDict else None,
            metadata=metadata if metadataIsDict else None
        )
        return [fallback]
async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
    """Generate DOCX content from a structured JSON document.

    Steps: create the document, resolve and install the style set, validate
    the JSON schema, add the title, render every section in order, then
    serialize the document.

    Args:
        json_content: Content in the standardized schema
            {metadata: {...}, documents: [{sections: [...]}]}.
        title: Preferred document title; metadata "title" is the fallback.
        userPrompt: Optional user prompt forwarded to style selection.
        aiService: Optional AI service forwarded to style selection.

    Returns:
        The saved DOCX file, base64-encoded as a UTF-8 string.

    Raises:
        Exception: "DOCX generation failed: ..." for any failure; the
            original exception is chained as ``__cause__``.
    """
    import time
    # perf_counter is monotonic, so logged durations are not distorted by
    # wall-clock adjustments the way time.time() differences can be.
    clock = time.perf_counter
    start_time = clock()
    try:
        self.logger.debug("_generateDocxFromJson: Starting document generation")
        # Create new document
        doc = Document()
        self.logger.debug(f"_generateDocxFromJson: Document created in {clock() - start_time:.2f}s")

        # Get style set: use styles from metadata if available, otherwise enhance with AI
        style_start = clock()
        self.logger.debug("_generateDocxFromJson: About to get style set")
        styleSet = await self._getStyleSet(json_content, userPrompt, aiService)
        self.logger.debug(f"_generateDocxFromJson: Style set retrieved in {clock() - style_start:.2f}s")

        # Setup basic document styles and create all styles from style set
        setup_start = clock()
        self.logger.debug("_generateDocxFromJson: Setting up document styles")
        self._setupBasicDocumentStyles(doc)
        self._setupDocumentStyles(doc, styleSet)
        self.logger.debug(f"_generateDocxFromJson: Document styles setup in {clock() - setup_start:.2f}s")

        # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
        if not self._validateJsonStructure(json_content):
            raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

        # Extract sections and metadata from standardized schema
        extract_start = clock()
        self.logger.debug("_generateDocxFromJson: Extracting sections and metadata")
        sections = self._extractSections(json_content)
        metadata = self._extractMetadata(json_content)
        self.logger.debug(f"_generateDocxFromJson: Extracted {len(sections)} sections in {clock() - extract_start:.2f}s")

        # The title parameter (from documents[].title) wins; metadata.title
        # is used only when the parameter is empty.
        document_title = title if title else metadata.get("title", "Generated Document")
        if document_title:
            doc.add_paragraph(document_title, style='Title')

        # Process each section in order
        render_start = clock()
        self.logger.debug(f"_generateDocxFromJson: Starting to render {len(sections)} sections")
        for idx, section in enumerate(sections):
            section_start = clock()
            self.logger.debug(f"_generateDocxFromJson: Rendering section {idx + 1}/{len(sections)}")
            self._renderJsonSection(doc, section, styleSet)
            self.logger.debug(f"_generateDocxFromJson: Section {idx + 1} rendered in {clock() - section_start:.2f}s")
        self.logger.debug(f"_generateDocxFromJson: All sections rendered in {clock() - render_start:.2f}s")

        # Save to buffer
        save_start = clock()
        self.logger.debug("_generateDocxFromJson: Starting to save document to buffer")
        buffer = io.BytesIO()
        doc.save(buffer)
        buffer.seek(0)
        self.logger.debug(f"_generateDocxFromJson: Document saved to buffer in {clock() - save_start:.2f}s")

        # Convert to base64
        encode_start = clock()
        self.logger.debug("_generateDocxFromJson: Converting to base64")
        docx_bytes = buffer.getvalue()
        docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
        self.logger.debug(f"_generateDocxFromJson: Converted to base64 in {clock() - encode_start:.2f}s (document size: {len(docx_bytes)} bytes)")

        total_time = clock() - start_time
        self.logger.info(f"_generateDocxFromJson: Document generation completed in {total_time:.2f}s")
        return docx_base64
    except Exception as e:
        self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
        # Chain the cause so the original traceback survives the re-raise
        raise Exception(f"DOCX generation failed: {str(e)}") from e
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
"""Get style set - use styles from document generation metadata if available,
otherwise enhance default styles with AI if userPrompt provided.
WICHTIG: In a dynamic scalable AI system, styling should come from document generation,
not be generated separately by renderers. Only fall back to AI if styles not provided.
Args:
extractedContent: Document content with metadata (may contain styles)
userPrompt: User's prompt (AI will detect style instructions in any language)
aiService: AI service (used only if styles not in metadata and userPrompt provided)
templateName: Name of template style set (None = default)
Returns:
Dict with style definitions for all document styles
"""
# Get default style set
if templateName == "corporate":
defaultStyleSet = self._getCorporateStyleSet()
elif templateName == "minimal":
defaultStyleSet = self._getMinimalStyleSet()
else:
defaultStyleSet = self._getDefaultStyleSet()
# FIRST: Check if styles are provided in document generation metadata (preferred approach)
if extractedContent:
metadata = extractedContent.get("metadata", {})
if isinstance(metadata, dict):
styles = metadata.get("styles")
if styles and isinstance(styles, dict):
self.logger.debug("Using styles from document generation metadata")
return self._validateStylesContrast(styles)
# FALLBACK: Enhance with AI if userPrompt provided (only if styles not in metadata)
if userPrompt and aiService:
self.logger.info(f"Styles not in metadata, enhancing with AI based on user prompt...")
enhancedStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
return self._validateStylesContrast(enhancedStyleSet)
else:
# Use default styles only
return defaultStyleSet
async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
"""Enhance default styles with AI based on user prompt."""
try:
style_template = self._createAiStyleTemplate("docx", userPrompt, defaultStyleSet)
enhanced_styles = await self._getAiStyles(aiService, style_template, defaultStyleSet)
return enhanced_styles
except Exception as e:
self.logger.warning(f"AI style enhancement failed: {str(e)}, using default styles")
return defaultStyleSet
def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
"""Validate and fix contrast issues in AI-generated styles."""
try:
# Fix table header contrast
if "table_header" in styles:
header = styles["table_header"]
bg_color = header.get("background", "#FFFFFF")
text_color = header.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
header["background"] = "#4F4F4F"
header["text_color"] = "#FFFFFF"
# Fix table cell contrast
if "table_cell" in styles:
cell = styles["table_cell"]
bg_color = cell.get("background", "#FFFFFF")
text_color = cell.get("text_color", "#000000")
# If both are white or both are dark, fix it
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
cell["background"] = "#FFFFFF"
cell["text_color"] = "#2F2F2F"
return styles
except Exception as e:
self.logger.warning(f"Style validation failed: {str(e)}")
return self._getDefaultStyleSet()
def _getDefaultStyleSet(self) -> Dict[str, Any]:
"""Default DOCX style set - used when no style instructions present."""
return {
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "center"},
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
}
def _setupBasicDocumentStyles(self, doc: Document) -> None:
    """Apply the base font (Calibri, 11pt) to the document's 'Normal' style.

    Failures are logged as a warning and otherwise ignored.
    """
    try:
        normalFont = doc.styles['Normal'].font
        normalFont.name = 'Calibri'
        normalFont.size = Pt(11)
    except Exception as e:
        self.logger.warning(f"Could not set up basic document styles: {str(e)}")
def _clearTemplateContent(self, doc: Document) -> None:
    """Empty a template document while leaving its styles in place.

    Paragraphs are kept but their content is cleared; tables are detached
    from their parent element. Failures are logged as a warning.
    """
    try:
        # Snapshot first: the collections are rebuilt from the XML on access
        existingParagraphs = list(doc.paragraphs)
        for para in existingParagraphs:
            para.clear()

        for tbl in tuple(doc.tables):
            tableElement = tbl._element
            tableElement.getparent().remove(tableElement)
    except Exception as e:
        self.logger.warning(f"Could not clear template content: {str(e)}")
def _renderJsonSection(self, doc: Document, section: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render every element of one JSON section into the document.

    Elements of type "reference" and "extracted_text" are written inline.
    All other elements go to the renderer matching the element's own type;
    if that type is not recognised, the section's "content_type" decides.
    A paragraph or unknown section type first checks for ``base64Data`` in
    the content and renders an image if present.

    Any exception aborts the rest of the section; it is logged and an
    error paragraph is appended to the document.
    """
    try:
        sectionType = section.get("content_type", "paragraph")
        elements = section.get("elements", [])
        # Nothing to render for an empty section
        if not elements:
            return

        # Ordered (type name, renderer) pairs, matched with == so that
        # unusual type values simply fail to match.
        renderers = (
            ("table", self._renderJsonTable),
            ("bullet_list", self._renderJsonBulletList),
            ("heading", self._renderJsonHeading),
            ("paragraph", self._renderJsonParagraph),
            ("code_block", self._renderJsonCodeBlock),
            ("image", self._renderJsonImage),
        )

        def findRenderer(typeName):
            for knownType, renderer in renderers:
                if typeName == knownType:
                    return renderer
            return None

        for element in elements:
            # Only dict elements carry renderable content
            if not isinstance(element, dict):
                continue
            elementType = element.get("type", "")

            if elementType == "reference":
                # Document reference: an italic placeholder line
                label = element.get("label", "Reference")
                referencePara = doc.add_paragraph(f"[Reference: {label}]")
                referencePara.runs[0].italic = True
                continue

            if elementType == "extracted_text":
                # Extracted text: a paragraph, with the source appended in italics
                extracted = element.get("content", "")
                origin = element.get("source", "")
                if extracted:
                    textPara = doc.add_paragraph(extracted)
                    if origin:
                        textPara.add_run(f" (Source: {origin})").italic = True
                continue

            # The element's own type takes precedence over the section type
            renderer = findRenderer(elementType)
            if renderer is None:
                renderer = findRenderer(sectionType)
                if renderer is None or sectionType == "paragraph":
                    # Image elements may arrive without a type; detect them by
                    # their base64 payload before treating them as text.
                    payload = element.get("content", {})
                    if isinstance(payload, dict) and payload.get("base64Data"):
                        renderer = self._renderJsonImage
                    else:
                        renderer = self._renderJsonParagraph
            renderer(doc, element, styles)
    except Exception as e:
        self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
        # Leave a visible marker in the document
        doc.add_paragraph(f"[Error rendering section: {str(e)}]")
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """
    Render a JSON table element into the document.

    Reads "headers" and "rows" from the element's "content" dict and hands
    them to ``_renderTableFastXml``, which builds the table XML directly
    rather than assigning ``cell.text`` cell by cell.

    Nothing is rendered when the content is not a dict or when headers or
    rows are empty. Errors are logged (with traceback) and swallowed.
    """
    import time
    startedAt = time.time()
    try:
        payload = table_data.get("content", {})
        if not isinstance(payload, dict):
            return
        headers = payload.get("headers", [])
        rows = payload.get("rows", [])
        if not (headers and rows):
            return

        rowCount, columnCount = len(rows), len(headers)
        cellCount = rowCount * columnCount
        self.logger.debug(f"_renderJsonTable: Starting FAST table render - {rowCount} rows x {columnCount} columns = {cellCount} cells")

        self._renderTableFastXml(doc, headers, rows, styles)

        elapsed = time.time() - startedAt
        cellsPerSecond = cellCount / elapsed if elapsed > 0 else 0
        self.logger.info(f"_renderJsonTable: Table completed in {elapsed:.2f}s ({rowCount} rows x {columnCount} cols = {cellCount} cells) - Rate: {cellsPerSecond:.0f} cells/s")
    except Exception as e:
        self.logger.error(f"Error rendering table: {str(e)}", exc_info=True)
def _renderTableFastXml(self, doc: Document, headers: List[str], rows: List[List[Any]], styles: Dict[str, Any]) -> None:
    """
    Build a table as raw WordprocessingML and insert it into the document.

    The table is assembled element by element (properties, grid, header
    row, data rows) instead of going through python-docx's cell API.

    Args:
        doc: Target document.
        headers: Column headers; their count defines the column count.
        rows: Data rows. ``None`` cells become empty strings, other values
            are stringified, and short rows are padded with empty cells.
        styles: Style set; only ``table_border.style`` is read here
            (default "grid").
    """
    import time
    from docx.oxml.shared import OxmlElement, qn

    create_start = time.time()
    # Get the document body element
    body = doc._body._body
    columnCount = len(headers)

    # Table properties, appended in schema order: width, alignment, borders, cell margins
    tbl = OxmlElement('w:tbl')
    tblPr = OxmlElement('w:tblPr')

    tblW = OxmlElement('w:tblW')
    tblW.set(qn('w:type'), 'auto')
    tblW.set(qn('w:w'), '0')
    tblPr.append(tblW)

    jc = OxmlElement('w:jc')
    jc.set(qn('w:val'), 'center')
    tblPr.append(jc)

    # Borders are applied directly so no template table style is needed
    borderStyle = styles.get("table_border", {}).get("style", "grid")
    tblPr.append(self._createTableBordersXml(borderStyle))

    tblCellMar = OxmlElement('w:tblCellMar')
    for side in ['top', 'left', 'bottom', 'right']:
        margin = OxmlElement(f'w:{side}')
        margin.set(qn('w:w'), '80')  # 80 twips = ~4pt padding
        margin.set(qn('w:type'), 'dxa')
        tblCellMar.append(margin)
    tblPr.append(tblCellMar)
    tbl.append(tblPr)

    # Table grid: one column definition per header
    tblGrid = OxmlElement('w:tblGrid')
    for _ in range(columnCount):
        tblGrid.append(OxmlElement('w:gridCol'))
    tbl.append(tblGrid)
    self.logger.debug(f"_renderTableFastXml: Table structure created in {time.time() - create_start:.3f}s")

    # Header row. Headers are stringified like data cells, because the XML
    # text setter rejects non-string values.
    rows_start = time.time()
    headerTexts = [str(header) if header is not None else '' for header in headers]
    tbl.append(self._createTableRowXml(headerTexts, isHeader=True))
    header_time = time.time() - rows_start
    self.logger.debug(f"_renderTableFastXml: Header row created in {header_time:.3f}s")

    # Data rows
    data_start = time.time()
    rowCount = len(rows)
    # Progress is logged roughly every 10%, and only for tables over 100 rows
    progressStep = rowCount // 10 if rowCount > 100 else 0
    for idx, rowData in enumerate(rows):
        cellTexts = [str(cell) if cell is not None else '' for cell in rowData]
        # Pad short rows so every row has one cell per column
        if len(cellTexts) < columnCount:
            cellTexts.extend([''] * (columnCount - len(cellTexts)))
        tbl.append(self._createTableRowXml(cellTexts, isHeader=False))

        if progressStep and (idx + 1) % progressStep == 0:
            elapsed = time.time() - data_start
            rate = (idx + 1) * columnCount / elapsed if elapsed > 0 else 0
            self.logger.debug(f"_renderTableFastXml: Progress {((idx + 1) / rowCount * 100):.0f}% ({idx + 1}/{rowCount} rows) - Rate: {rate:.0f} cells/s")
    data_time = time.time() - data_start

    # Insert the table at the end of the body content. The body normally ends
    # with a <w:sectPr> element that must stay last; python-docx inserts
    # paragraphs before it, so a plain append() would put the table after the
    # section properties and out of order with later paragraphs.
    sectPr = body.find(qn('w:sectPr'))
    if sectPr is not None:
        sectPr.addprevious(tbl)
    else:
        body.append(tbl)

    total_time = time.time() - create_start
    totalCells = (rowCount + 1) * columnCount
    rate = totalCells / total_time if total_time > 0 else 0
    self.logger.debug(f"_renderTableFastXml: All rows created in {data_time:.2f}s, total: {total_time:.2f}s, rate: {rate:.0f} cells/s")
def _createTableBordersXml(self, borderStyle: str) -> Any:
    """
    Create the ``w:tblBorders`` XML element for a border style.

    Supports:
    - 'grid': single lines on all outer and inner borders
    - 'horizontal_only': single lines on top, bottom and between rows;
      left, right and inner vertical borders are explicitly set to 'nil'
    - any other value: single lines on the four outer borders only

    Children are always emitted in the order the WordprocessingML schema
    defines for table borders (top, left, bottom, right, insideH, insideV).

    Args:
        borderStyle: One of the style names above.

    Returns:
        The populated ``w:tblBorders`` element.
    """
    from docx.oxml.shared import OxmlElement, qn

    borderColor = '404040'  # dark gray
    borderSize = '4'  # 0.5pt (in eighths of a point)

    # Schema sequence; the children must appear in exactly this order
    schemaOrder = ('top', 'left', 'bottom', 'right', 'insideH', 'insideV')

    if borderStyle == "grid":
        visible = set(schemaOrder)
        hideOthers = False
    elif borderStyle == "horizontal_only":
        visible = {'top', 'bottom', 'insideH'}
        hideOthers = True  # vertical borders are switched off explicitly
    else:
        visible = {'top', 'left', 'bottom', 'right'}
        hideOthers = False

    tblBorders = OxmlElement('w:tblBorders')
    for borderName in schemaOrder:
        if borderName in visible:
            border = OxmlElement(f'w:{borderName}')
            border.set(qn('w:val'), 'single')
            border.set(qn('w:sz'), borderSize)
            border.set(qn('w:space'), '0')
            border.set(qn('w:color'), borderColor)
        elif hideOthers:
            border = OxmlElement(f'w:{borderName}')
            border.set(qn('w:val'), 'nil')
        else:
            continue
        tblBorders.append(border)
    return tblBorders
def _createTableRowXml(self, cells: List[str], isHeader: bool = False) -> Any:
    """
    Create a ``w:tr`` table row element with one cell per entry in ``cells``.

    Each cell holds one paragraph with one run containing the cell text.
    Header rows are flagged with ``w:tblHeader`` and their cells get a blue
    fill (4472C4) with bold white text.

    Args:
        cells: Cell texts. ``None`` becomes an empty string and other
            non-string values are stringified.
        isHeader: Whether to apply the header flag and styling.

    Returns:
        The populated ``w:tr`` element.
    """
    from docx.oxml.shared import OxmlElement, qn

    xmlSpaceAttr = '{http://www.w3.org/XML/1998/namespace}space'

    tr = OxmlElement('w:tr')
    if isHeader:
        trPr = OxmlElement('w:trPr')
        trPr.append(OxmlElement('w:tblHeader'))
        tr.append(trPr)

    for cellText in cells:
        # The XML text setter only accepts strings
        text = '' if cellText is None else str(cellText)

        tc = OxmlElement('w:tc')

        # Cell properties: automatic width, plus header fill
        tcPr = OxmlElement('w:tcPr')
        tcW = OxmlElement('w:tcW')
        tcW.set(qn('w:type'), 'auto')
        tcW.set(qn('w:w'), '0')
        tcPr.append(tcW)
        if isHeader:
            shd = OxmlElement('w:shd')
            shd.set(qn('w:val'), 'clear')
            shd.set(qn('w:color'), 'auto')
            shd.set(qn('w:fill'), '4472C4')  # blue header background
            tcPr.append(shd)
        tc.append(tcPr)

        p = OxmlElement('w:p')
        r = OxmlElement('w:r')
        if isHeader:
            # Bold white text on the blue header fill
            rPr = OxmlElement('w:rPr')
            rPr.append(OxmlElement('w:b'))
            color = OxmlElement('w:color')
            color.set(qn('w:val'), 'FFFFFF')
            rPr.append(color)
            r.append(rPr)

        t = OxmlElement('w:t')
        # Leading/trailing whitespace of any kind (spaces, tabs, newlines)
        # is only kept by consumers when xml:space="preserve" is set.
        if text != text.strip():
            t.set(xmlSpaceAttr, 'preserve')
        t.text = text
        r.append(t)
        p.append(r)
        tc.append(p)
        tr.append(tc)
    return tr
def _applyHorizontalBordersOnly(self, table) -> None:
    """Give a table horizontal borders only (no vertical lines).

    Replaces any existing ``w:tblBorders`` in the table properties with one
    where top, bottom and inner-horizontal borders are single black lines
    and left, right and inner-vertical borders are 'none'. A ``w:tblPr``
    element is created if the table has none. Failures are logged as a
    warning and otherwise ignored.

    Args:
        table: python-docx table object (its ``_element`` is modified).
    """
    try:
        from docx.oxml.shared import OxmlElement, qn

        # Get (or create) the table properties
        tbl_pr = table._element.find(qn('w:tblPr'))
        if tbl_pr is None:
            tbl_pr = OxmlElement('w:tblPr')
            table._element.insert(0, tbl_pr)

        # Drop any previous border definition
        existing_borders = tbl_pr.find(qn('w:tblBorders'))
        if existing_borders is not None:
            tbl_pr.remove(existing_borders)

        # (border name, drawn?) listed in the order the WordprocessingML
        # schema requires for the children of w:tblBorders.
        borderPlan = (
            ('top', True),
            ('left', False),
            ('bottom', True),
            ('right', False),
            ('insideH', True),
            ('insideV', False),
        )
        tbl_borders = OxmlElement('w:tblBorders')
        for borderName, drawn in borderPlan:
            border = OxmlElement(f'w:{borderName}')
            if drawn:
                border.set(qn('w:val'), 'single')
                border.set(qn('w:sz'), '4')
                border.set(qn('w:space'), '0')
                border.set(qn('w:color'), '000000')
            else:
                border.set(qn('w:val'), 'none')
            tbl_borders.append(border)
        tbl_pr.append(tbl_borders)
    except Exception as e:
        self.logger.warning(f"Could not apply horizontal borders: {str(e)}")
def _setCellBackground(self, cell, color: RGBColor) -> None:
    """Fill a table cell with a solid background colour.

    Any existing ``w:shd`` in the cell properties is replaced. Failures are
    logged as a warning and otherwise ignored.

    Args:
        cell: python-docx table cell (its ``_element`` is modified).
        color: Colour that unpacks into three integer components (r, g, b).
    """
    try:
        from docx.oxml.shared import OxmlElement, qn

        shadingTag = qn('w:shd')
        cellElement = cell._element

        # Locate the cell properties, creating them when absent
        cellProps = cellElement.find(qn('w:tcPr'))
        if cellProps is None:
            cellProps = OxmlElement('w:tcPr')
            cellElement.insert(0, cellProps)

        # Discard previous shading
        previous = cellProps.find(shadingTag)
        if previous is not None:
            cellProps.remove(previous)

        newShading = OxmlElement('w:shd')
        newShading.set(qn('w:val'), 'clear')
        newShading.set(qn('w:color'), 'auto')
        # Unpack the colour and format each component as two lowercase hex digits
        red, green, blue = color
        fill = "".join(format(component, "02x") for component in (red, green, blue))
        newShading.set(qn('w:fill'), fill)
        cellProps.append(newShading)
    except Exception as e:
        self.logger.warning(f"Could not set cell background: {str(e)}")
def _setCellBackgroundFast(self, cell, hex_color: str) -> None:
    """
    Fill a table cell with a solid background given as a ready-made hex string.

    Same effect as ``_setCellBackground`` but skips the colour conversion,
    so callers can compute the hex value once and reuse it. Any existing
    ``w:shd`` is replaced. Failures are logged as a warning and ignored.

    Args:
        cell: python-docx table cell (its ``_element`` is modified).
        hex_color: Value written to the ``w:fill`` attribute.
    """
    try:
        from docx.oxml.shared import OxmlElement, qn

        shadingTag = qn('w:shd')
        cellElement = cell._element

        # Locate the cell properties, creating them when absent
        cellProps = cellElement.find(qn('w:tcPr'))
        if cellProps is None:
            cellProps = OxmlElement('w:tcPr')
            cellElement.insert(0, cellProps)

        # Discard previous shading
        previous = cellProps.find(shadingTag)
        if previous is not None:
            cellProps.remove(previous)

        newShading = OxmlElement('w:shd')
        for attrName, attrValue in (('w:val', 'clear'), ('w:color', 'auto'), ('w:fill', hex_color)):
            newShading.set(qn(attrName), attrValue)
        cellProps.append(newShading)
    except Exception as e:
        self.logger.warning(f"Could not set cell background: {str(e)}")
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON bullet list as 'List Bullet' paragraphs.

    Items may be plain strings or dicts with a "text" key; anything else is
    skipped. Font size and colour from ``styles["bullet_list"]`` are applied
    to the first run of each item paragraph.

    Errors are logged as a warning and abort the rest of the list.

    Args:
        doc: Target document.
        list_data: List element; items are read from ``content["items"]``.
        styles: Style set; only the "bullet_list" entry is used.
    """
    try:
        # Extract from nested content structure
        content = list_data.get("content", {})
        if not isinstance(content, dict):
            return
        items = content.get("items") or []
        bullet_style = styles.get("bullet_list", {})

        # Parse the style values once, outside the item loop
        font_size_pt = None
        text_color_rgb = None
        if bullet_style:
            if "font_size" in bullet_style:
                font_size_pt = Pt(bullet_style["font_size"])
            if "color" in bullet_style:
                color_hex = bullet_style["color"].lstrip('#')
                text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))

        for item in items:
            if isinstance(item, str):
                item_text = item
            elif isinstance(item, dict) and "text" in item:
                item_text = item["text"]
            else:
                # Unsupported item shape: skip it. Without this, the styling
                # below would hit an unbound paragraph (first item) or
                # restyle the previous item's paragraph.
                continue

            para = doc.add_paragraph(item_text, style='List Bullet')
            runs = para.runs
            if not runs:
                # Empty text produces no run, so there is nothing to style
                continue
            run = runs[0]
            if font_size_pt:
                run.font.size = font_size_pt
            if text_color_rgb:
                run.font.color.rgb = text_color_rgb
    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON heading element.

    Reads "text" and "level" from the element's "content" dict. The level is
    coerced to an integer (falling back to 1 when that is impossible) and
    clamped to 1..6. Headings without text are skipped. Errors are logged
    as a warning.

    Args:
        doc: Target document.
        heading_data: Heading element with a "content" dict.
        styles: Style set (not used by this method).
    """
    try:
        # Extract from nested content structure
        content = heading_data.get("content", {})
        if not isinstance(content, dict):
            return
        text = content.get("text", "")
        if not text:
            return

        # Levels may arrive as strings or floats (e.g. "2"); an unusable
        # value must not cost us the heading text, so fall back to level 1.
        try:
            level = int(content.get("level", 1))
        except (TypeError, ValueError):
            level = 1
        level = max(1, min(6, level))

        # NOTE(review): levels 3-6 are rendered with "Heading 1", which
        # flattens the hierarchy. Kept as-is; confirm whether this is
        # intended or whether "Heading {level}" should be used.
        style_name = f"Heading {level}" if level <= 2 else "Heading 1"
        try:
            doc.add_paragraph(text, style=style_name)
        except KeyError:
            # Style not present in this document: use the built-in heading API
            doc.add_heading(text, level=level)
    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON paragraph element.

    The text comes from ``content["text"]`` or from ``content`` itself when
    it is a string. Text that looks like base64 image data is replaced by a
    red error line instead of being dumped into the document. Font size,
    bold, colour and alignment from ``styles["paragraph"]`` are applied to
    the new paragraph. Errors are logged as a warning.
    """
    try:
        # Extract from nested content structure
        content = paragraph_data.get("content", {})
        if isinstance(content, dict):
            text = content.get("text", "")
        else:
            text = content if isinstance(content, str) else ""

        if text:
            # Guard against base64 image payloads ending up as paragraph text:
            # JPEG data starts with /9j/, PNG with iVBORw0KGgo; otherwise a long
            # text whose first 100 characters are all base64 symbols is suspect.
            base64Alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
            looksLikeBase64 = (
                text.startswith("/9j/")
                or text.startswith("iVBORw0KGgo")
                or (len(text) > 100 and all(ch in base64Alphabet for ch in text[:100]))
            )
            if looksLikeBase64:
                self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(text)})")
                errorPara = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]")
                if errorPara.runs:
                    errorPara.runs[0].font.color.rgb = RGBColor(255, 0, 0)  # red marks the error
                return

        if not text:
            return

        para = doc.add_paragraph(text)
        paragraph_style = styles.get("paragraph", {})
        if not paragraph_style:
            return

        # Parse the style values
        fontSize = Pt(paragraph_style["font_size"]) if "font_size" in paragraph_style else None
        textColor = None
        if "color" in paragraph_style:
            hexDigits = paragraph_style["color"].lstrip('#')
            textColor = RGBColor(int(hexDigits[0:2], 16), int(hexDigits[2:4], 16), int(hexDigits[4:6], 16))
        bold = paragraph_style.get("bold", False)

        # Style the first run, creating one if the paragraph has none
        run = para.runs[0] if len(para.runs) > 0 else para.add_run()
        if fontSize:
            run.font.size = fontSize
        run.font.bold = bold
        if textColor:
            run.font.color.rgb = textColor

        if "align" in paragraph_style:
            align = paragraph_style["align"]
            # Anything other than center/right is left-aligned
            if align == "center":
                alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == "right":
                alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                alignment = WD_ALIGN_PARAGRAPH.LEFT
            para.alignment = alignment
    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON code block element as a monospaced paragraph.

    Reads ``code`` and ``language`` from ``code_data["content"]`` and the
    ``code_block`` entry of ``styles`` (keys ``font``, ``font_size`` and
    ``color``). When a language is given, a bold "Code (<language>):" label
    paragraph is added before the code paragraph.

    Nothing is rendered when ``content`` is not a dict or the code is empty.
    Any error is logged as a warning and never propagates to the caller.
    """
    try:
        # Extract from nested content structure
        content = code_data.get("content", {})
        if not isinstance(content, dict):
            return
        code = content.get("code", "")
        if not code:
            return
        language = content.get("language", "")
        code_style = styles.get("code_block", {})

        # Resolve all styling BEFORE touching the document, so a bad style
        # value can neither drop the code text nor leave an orphaned label.
        font_name = code_style.get("font", "Courier New")
        try:
            font_size = Pt(code_style.get("font_size", 9))
        except (TypeError, ValueError):
            self.logger.warning(f"Ignoring invalid code block font_size: {code_style.get('font_size')!r}")
            font_size = Pt(9)
        text_color = None
        if "color" in code_style:
            try:
                color_hex = code_style["color"].lstrip('#')
                text_color = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
            except (AttributeError, TypeError, ValueError):
                # e.g. "#FFF" or "red": keep the code, just skip the colour.
                self.logger.warning(f"Ignoring invalid code block color: {code_style['color']!r}")

        if language:
            lang_para = doc.add_paragraph(f"Code ({language}):")
            if lang_para.runs:
                lang_para.runs[0].bold = True

        code_para = doc.add_paragraph(code)
        # add_paragraph(text) normally yields one run; create one if it did not.
        run = code_para.runs[0] if code_para.runs else code_para.add_run()
        run.font.name = font_name
        run.font.size = font_size
        if text_color:
            run.font.color.rgb = text_color
    except Exception as e:
        self.logger.warning(f"Error rendering code block: {str(e)}")
def _renderJsonImage(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Embed a base64-encoded image element into the DOCX document.

The image payload, alt text and caption are read from
``image_data["content"]`` (keys ``base64Data``, ``altText``, ``caption``),
with the same keys on ``image_data`` itself used as a fallback. When
``content`` is a string, a warning is logged and nothing is rendered.

The decoded image is added with ``doc.add_picture``; if Pillow can read it,
the width is scaled down (never up) to fit a 6.5 x 9.0 inch area assuming
96 pixels per inch, otherwise a fixed width of 6.0 inches is used. An italic
caption paragraph follows when a caption or a non-default alt text exists.

Any failure is logged as an error and reported in the document as a red
"[Error: Could not embed image ...]" paragraph. ``styles`` is not referenced
in this method.
"""
try:
# Extract from nested content structure
content = image_data.get("content", {})
# Defaults used when the content dict does not provide the field
base64_data = ""
alt_text = "Image"
caption = ""
if isinstance(content, dict):
base64_data = content.get("base64Data", "")
alt_text = content.get("altText", "Image")
caption = content.get("caption", "")
elif isinstance(content, str):
# Content might be base64 string directly (shouldn't happen, but handle it)
self.logger.warning("Image content is a string, not a dict. This should not happen.")
return
# If base64Data not found in content, try direct element fields (fallback)
if not base64_data:
base64_data = image_data.get("base64Data", "")
if not alt_text or alt_text == "Image":
alt_text = image_data.get("altText", "Image")
if not caption:
caption = image_data.get("caption", "")
# Without image data there is nothing to embed; raising here routes to the
# outer handler below, which writes a visible error paragraph instead.
if not base64_data:
raise Exception("No image data provided (base64Data is empty)")
try:
image_bytes = base64.b64decode(base64_data)
image_stream = io.BytesIO(image_bytes)
# Get image dimensions to calculate proper size
try:
# Imported here so a missing Pillow only triggers the fixed-width fallback below
from PIL import Image as PILImage
pil_image = PILImage.open(image_stream)
img_width_px, img_height_px = pil_image.size
# DOCX page width is typically 8.5 inches, usable width ~6.5 inches with margins
# Standard margins: 1 inch left/right, so usable width = 6.5 inches
max_width_inches = 6.5
max_height_inches = 9.0 # Leave room for text above/below
# Calculate scale factor to fit within page dimensions
# Convert pixels to inches (assuming 96 DPI for modern displays, but images may vary)
# Use conservative estimate: 1 inch = 96 pixels
img_width_inches = img_width_px / 96.0
img_height_inches = img_height_px / 96.0
# Calculate scale to fit
width_scale = max_width_inches / img_width_inches if img_width_inches > max_width_inches else 1.0
height_scale = max_height_inches / img_height_inches if img_height_inches > max_height_inches else 1.0
scale = min(width_scale, height_scale, 1.0) # Don't scale up, only down
final_width = img_width_inches * scale
# NOTE(review): final_height is computed but only the width is passed to
# add_picture; presumably the aspect ratio is kept by the library - confirm.
final_height = img_height_inches * scale
# Reset stream for docx
image_stream.seek(0)
doc.add_picture(image_stream, width=Inches(final_width))
except Exception:
# Fallback: use conservative default size if PIL fails
# (this also retries add_picture when the first add_picture call itself raised)
image_stream.seek(0)
doc.add_picture(image_stream, width=Inches(6.0))
# Use caption from section if available, otherwise use alt_text
if caption:
caption_text = caption
elif alt_text and alt_text != "Image":
# Only use alt_text if it doesn't look like a usageHint
if "Render as visual element:" in alt_text:
# Extract filename from usageHint if possible
parts = alt_text.split("Render as visual element:")
if len(parts) > 1:
filename = parts[1].strip()
caption_text = f"Figure: {filename}"
else:
caption_text = alt_text
else:
caption_text = f"Figure: {alt_text}"
else:
# Neither a caption nor a meaningful alt text: no caption paragraph
caption_text = None
if caption_text:
caption_para = doc.add_paragraph(caption_text)
caption_para.runs[0].italic = True
except Exception as embedError:
# Image decoding or embedding failed
raise Exception(f"Failed to decode or embed image: {str(embedError)}")
except Exception as e:
# Report the failure both in the log and as a visible red paragraph in the document
self.logger.error(f"Error embedding image in DOCX: {str(e)}")
errorMsg = f"[Error: Could not embed image '{image_data.get('altText', 'Image')}'. {str(e)}]"
errorPara = doc.add_paragraph(errorMsg)
if errorPara.runs:
errorPara.runs[0].font.color.rgb = RGBColor(255, 0, 0) # Red color for error
def _extractStructureFromPrompt(self, userPrompt: str, title: str) -> Dict[str, Any]:
"""Extract document structure from user prompt."""
structure = {
'title': title,
'sections': [],
'format': 'standard'
}
if not userPrompt:
return structure
# Extract title from prompt if not provided
if not title or title == "Generated Document":
# Look for "create a ... document" or "generate a ... report"
title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', userPrompt.lower())
if title_match:
structure['title'] = title_match.group(1).strip().title()
# Extract sections from numbered lists in prompt
section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
sections = re.findall(section_pattern, userPrompt)
for num, section_text in sections:
structure['sections'].append({
'number': int(num),
'title': section_text.strip(),
'level': 2 # H2 level
})
# If no numbered sections found, try to extract from "including:" patterns
if not structure['sections']:
including_match = re.search(r'including:\s*(.+?)(?:\.|$)', userPrompt, re.DOTALL)
if including_match:
including_text = including_match.group(1)
# Split by common separators
parts = re.split(r'[,;]\s*', including_text)
for i, part in enumerate(parts, 1):
part = part.strip()
if part:
structure['sections'].append({
'number': i,
'title': part,
'level': 2
})
# If still no sections, extract from any list-like patterns
if not structure['sections']:
# Look for bullet points or dashes
bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
bullets = re.findall(bullet_pattern, userPrompt)
for i, bullet in enumerate(bullets, 1):
bullet = bullet.strip()
if bullet and len(bullet) > 3:
structure['sections'].append({
'number': i,
'title': bullet,
'level': 2
})
# If still no sections, extract from sentence structure
if not structure['sections']:
# Split prompt into sentences and use as sections
sentences = re.split(r'[.!?]\s+', userPrompt)
for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections
sentence = sentence.strip()
if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
structure['sections'].append({
'number': i,
'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
'level': 2
})
# Final fallback: create sections from prompt keywords
if not structure['sections']:
# Extract key action words from prompt
action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
found_actions = []
for action in action_words:
if action in userPrompt.lower():
found_actions.append(action.title())
if found_actions:
for i, action in enumerate(found_actions[:3], 1):
structure['sections'].append({
'number': i,
'title': f"{action} Document Content",
'level': 2
})
else:
# Last resort: generic but meaningful sections
structure['sections'] = [
{'number': 1, 'title': 'Document Analysis', 'level': 2},
{'number': 2, 'title': 'Key Information', 'level': 2},
{'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
]
return structure
def _generateFromStructure(self, doc, content: str, structure: Dict[str, Any]):
"""Generate DOCX content based on extracted structure."""
# Add sections based on prompt structure
for section in structure['sections']:
# Add section heading
doc.add_heading(f"{section['number']}) {section['title']}", level=section['level'])
# Add AI-generated content for this section
# Try to extract relevant content for this section from the AI response
section_content = self._extractSectionContent(content, section['title'])
if section_content:
doc.add_paragraph(section_content)
else:
# If no specific content found, add a note
doc.add_paragraph(f"Content for {section['title']} based on document analysis.")
# Add some spacing
doc.add_paragraph()
# Add the complete AI-generated content as additional analysis
if content and content.strip():
doc.add_heading("Complete Analysis", level=1)
doc.add_paragraph(content)
def _extractSectionContent(self, content: str, section_title: str) -> str:
"""Extract relevant content for a specific section from AI response."""
if not content or not section_title:
return ""
# Look for content that matches the section title
section_keywords = section_title.lower().split()
# Split content into paragraphs
paragraphs = content.split('\n\n')
relevant_paragraphs = []
for paragraph in paragraphs:
paragraph_lower = paragraph.lower()
# Check if paragraph contains keywords from section title
if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
relevant_paragraphs.append(paragraph.strip())
if relevant_paragraphs:
return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section
return ""
def _setupDocumentStyles(self, doc: Document, styleSet: Dict[str, Any]) -> None:
    """Register the paragraph styles described by ``styleSet`` in the document.

    Called before rendering so the styles exist when content is added. The
    style-set keys ``title``, ``heading1``, ``heading2`` and ``paragraph`` map
    to the document styles "Title", "Heading 1", "Heading 2" and
    "Custom Paragraph"; keys that are absent are skipped. Failures are
    logged as warnings and do not propagate.
    """
    try:
        from docx.enum.style import WD_STYLE_TYPE

        # (style-set key, document style name), applied in this order.
        # Note: List Bullet and List Number are built-in Word styles, but we
        # apply custom styling to runs, so they are not created here.
        style_names = (
            ("title", "Title"),
            ("heading1", "Heading 1"),
            ("heading2", "Heading 2"),
            ("paragraph", "Custom Paragraph"),
        )
        for config_key, style_name in style_names:
            if config_key in styleSet:
                self._createStyle(doc, style_name, styleSet[config_key], WD_STYLE_TYPE.PARAGRAPH)
    except Exception as e:
        self.logger.warning(f"Could not set up document styles: {str(e)}")
def _createStyle(self, doc: Document, styleName: str, styleConfig: Dict[str, Any], styleType) -> None:
    """Create or update a style in the document's style collection.

    An existing style named ``styleName`` is updated in place; otherwise a
    new style of ``styleType`` is added and based on "Normal". The optional
    ``styleConfig`` keys ``font_size`` (points), ``bold``, ``color``
    ("#RRGGBB"), ``font`` (font name) and ``align`` ("center", "right",
    anything else means left) are applied.

    An unparsable colour is logged and skipped so the remaining settings are
    still applied. Any other failure is logged as a warning; nothing is
    raised to the caller.
    """
    try:
        # Try to get existing style, or create new one
        try:
            doc_style = doc.styles[styleName]
        except KeyError:
            doc_style = doc.styles.add_style(styleName, styleType)
            # Only freshly created styles are re-based on Normal
            doc_style.base_style = doc.styles['Normal']

        # Apply font configuration
        font = doc_style.font
        if "font_size" in styleConfig:
            font.size = Pt(styleConfig["font_size"])
        if "bold" in styleConfig:
            font.bold = styleConfig["bold"]
        if "color" in styleConfig:
            try:
                color_hex = styleConfig["color"].lstrip('#')
                font.color.rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
            except (AttributeError, TypeError, ValueError):
                # e.g. "#FFF" or "red": skip the colour, keep font name and alignment
                self.logger.warning(f"Ignoring invalid color for style '{styleName}': {styleConfig['color']!r}")
        if "font" in styleConfig:
            font.name = styleConfig["font"]

        # Set paragraph formatting for alignment
        if "align" in styleConfig:
            align = styleConfig["align"]
            if align == "center":
                alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == "right":
                alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                alignment = WD_ALIGN_PARAGRAPH.LEFT
            doc_style.paragraph_format.alignment = alignment
    except Exception as e:
        self.logger.warning(f"Could not create style '{styleName}': {str(e)}")
def _processSection(self, doc, lines: list):
"""Process a section of content into DOCX elements."""
for line in lines:
if not line.strip():
continue
# Check for tables (lines with |)
if '|' in line and not line.startswith('|'):
# This might be part of a table, process as table
table_data = self._extractTableData(lines)
if table_data:
self._addTable(doc, table_data)
return
# Check for lists
if line.startswith('- ') or line.startswith('* '):
# This is a list item
doc.add_paragraph(line[2:], style='List Bullet')
elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
# This is a numbered list item
doc.add_paragraph(line[3:], style='List Number')
else:
# Regular paragraph
doc.add_paragraph(line)
def _extractTableData(self, lines: list) -> list:
"""Extract table data from lines."""
table_data = []
in_table = False
for line in lines:
if '|' in line:
if not in_table:
in_table = True
# Split by | and clean up
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
if cells:
table_data.append(cells)
elif in_table and not line.strip():
# Empty line, might be end of table
break
return table_data if len(table_data) > 1 else []
def _addTable(self, doc, table_data: list):
    """Add a centered table filled with ``table_data`` and style its header.

    ``table_data`` is a list of rows, each a list of cell strings. The table
    gets as many columns as the widest row, so ragged input loses no cells;
    shorter rows leave their trailing cells empty. Nothing is added for
    empty input. Failures are logged as warnings and do not propagate.
    """
    try:
        if not table_data:
            return
        # Size by the widest row: sizing by the first row alone silently
        # dropped the extra cells of any wider row.
        column_count = max(len(row) for row in table_data)
        if column_count == 0:
            return

        table = doc.add_table(rows=len(table_data), cols=column_count)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER

        for table_row, row_data in zip(table.rows, table_data):
            # Look the row's cells up once instead of once per cell
            row_cells = table_row.cells
            for cell, cell_data in zip(row_cells, row_data):
                cell.text = cell_data

        # Style the table
        self._styleTable(table)
    except Exception as e:
        self.logger.warning(f"Could not add table: {str(e)}")
def _styleTable(self, table):
"""Apply styling to the table."""
try:
# Style header row
if len(table.rows) > 0:
header_cells = table.rows[0].cells
for cell in header_cells:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
except Exception as e:
self.logger.warning(f"Could not style table: {str(e)}")
def _processTableRow(self, doc, line: str):
"""Process a table row and add it to the document."""
if not line.strip():
return
# Split by pipe separator
parts = [part.strip() for part in line.split('|')]
if len(parts) >= 2:
# This is a table row - create a table if it doesn't exist
if not hasattr(self, '_current_table') or self._current_table is None:
# Create new table
self._current_table = doc.add_table(rows=1, cols=len(parts))
self._current_table.style = 'Table Grid'
# Add header row
for i, part in enumerate(parts):
if i < len(self._current_table.rows[0].cells):
cell = self._current_table.rows[0].cells[i]
cell.text = part
# Make header bold
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
else:
# Add data row to existing table
row = self._current_table.add_row()
for i, part in enumerate(parts):
if i < len(row.cells):
row.cells[i].text = part
else:
# Not a table row, treat as regular text
doc.add_paragraph(line)
def _cleanAiContent(self, content: str) -> str:
"""Clean AI-generated content by removing debug information and duplicates."""
if not content:
return ""
# Remove debug information
lines = content.split('\n')
clean_lines = []
for line in lines:
# Skip debug lines and separators
if (line.startswith('[Skipped ') or
line.startswith('=== DOCUMENT:') or
line.startswith('---') or
line.startswith('FILENAME:') or
line.strip() == '' or
line.strip() == '---'):
continue
clean_lines.append(line)
# Join lines and remove duplicate content
clean_content = '\n'.join(clean_lines)
# Remove duplicate sections by keeping only the first occurrence
sections = clean_content.split('\n\n')
seen_sections = set()
unique_sections = []
for section in sections:
section_key = section.strip()[:50] # Use first 50 chars as key
if section_key not in seen_sections and section.strip():
seen_sections.add(section_key)
unique_sections.append(section)
return '\n\n'.join(unique_sections)
def _processTables(self, doc, content: str) -> str:
"""
Process tables in the content (both CSV and pipe-separated) and convert them to Word tables.
Returns the content with tables replaced by placeholders.
"""
# csv is already imported at module level
lines = content.split('\n')
processed_lines = []
i = 0
while i < len(lines):
line = lines[i].strip()
# Check if this line looks like a table (contains pipes or commas with multiple fields)
is_pipe_table = '|' in line and len(line.split('|')) >= 2
is_csv_table = ',' in line and len(line.split(',')) >= 2
if is_pipe_table or is_csv_table:
# Collect consecutive table lines
table_lines = []
j = i
# Determine separator and collect lines
separator = '|' if is_pipe_table else ','
while j < len(lines):
current_line = lines[j].strip()
if separator in current_line and len(current_line.split(separator)) >= 2:
table_lines.append(current_line)
j += 1
else:
break
if len(table_lines) >= 2: # At least header + 1 data row
# Create Word table
try:
if separator == '|':
# Process pipe-separated table
rows = []
for table_line in table_lines:
# Split by pipe and clean up
cells = [cell.strip() for cell in table_line.split('|')]
rows.append(cells)
else:
# Process CSV table
csv_content = '\n'.join(table_lines)
csv_reader = csv.reader(io.StringIO(csv_content))
rows = list(csv_reader)
if rows and len(rows[0]) > 0:
# Create Word table
table = doc.add_table(rows=len(rows), cols=len(rows[0]))
table.style = 'Table Grid'
# Populate table
for row_idx, row_data in enumerate(rows):
for col_idx, cell_data in enumerate(row_data):
if col_idx < len(table.rows[row_idx].cells):
table.rows[row_idx].cells[col_idx].text = cell_data.strip()
# Make header row bold
if row_idx == 0:
for cell in table.rows[row_idx].cells:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.bold = True
# Add placeholder to mark where table was inserted
processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]")
# Skip the table lines
i = j
continue
except Exception as e:
# If table parsing fails, treat as regular text
pass
processed_lines.append(line)
i += 1
return '\n'.join(processed_lines)
def _parseAndFormatContent(self, doc, content: str, title: str):
"""Parse AI-generated content in standardized format and apply proper DOCX formatting."""
if not content:
return
# Process tables and replace them with placeholders
content = self._processTables(doc, content)
# Parse content line by line in exact sequence
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
# Empty line - add paragraph break
doc.add_paragraph()
continue
# Skip table placeholders (already processed)
if line.startswith('[TABLE_INSERTED_'):
continue
# Check if this is a Markdown heading (# ## ###)
if line.startswith('#'):
level = len(line) - len(line.lstrip('#'))
heading_text = line.lstrip('# ').strip()
doc.add_heading(heading_text, level=min(level, 3))
# Check if this is a numbered heading (1) Title, 2) Title, etc.)
elif re.match(r'^\d+\)\s+.+', line):
heading_text = re.sub(r'^\d+\)\s+', '', line)
doc.add_heading(heading_text, level=1)
# Check if this is a Markdown list item
elif line.startswith('- ') or re.match(r'^\d+\.\s+', line):
bullet_text = re.sub(r'^[-•]\s+|\d+\.\s+', '', line)
self._add_bullet_point(doc, bullet_text)
# Check if this is a code block
elif line.startswith('```'):
if not line.endswith('```'):
# Start of code block - collect until end
code_lines = [line]
continue
else:
# End of code block
if 'code_lines' in locals():
code_lines.append(line)
code_text = '\n'.join(code_lines)
para = doc.add_paragraph()
run = para.add_run(code_text)
run.font.name = 'Courier New'
del code_lines
# Regular paragraph
else:
self._addParagraphToDoc(doc, line)
def _addParagraphToDoc(self, doc, text: str):
"""Add a paragraph to the document with proper formatting."""
if not text.strip():
return
# Check for Markdown formatting (**bold**, *italic*)
para = doc.add_paragraph()
# Split by bold markers
parts = text.split('**')
for i, part in enumerate(parts):
if i % 2 == 0:
# Regular text - check for italic
italic_parts = part.split('*')
for j, italic_part in enumerate(italic_parts):
if j % 2 == 0:
# Regular text
if italic_part:
para.add_run(italic_part)
else:
# Italic text
if italic_part:
run = para.add_run(italic_part)
run.italic = True
else:
# Bold text
if part:
run = para.add_run(part)
run.bold = True