# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
DOCX renderer for report generation using python-docx.
"""
|
||
|
||
from .documentRendererBaseTemplate import BaseRenderer
|
||
from modules.datamodels.datamodelDocument import RenderedDocument
|
||
from typing import Dict, Any, List, Optional
|
||
import io
|
||
import base64
|
||
import re
|
||
import csv
|
||
|
||
try:
|
||
from docx import Document
|
||
from docx.shared import Inches, Pt, RGBColor
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||
DOCX_AVAILABLE = True
|
||
except ImportError:
|
||
DOCX_AVAILABLE = False
|
||
|
||
class RendererDocx(BaseRenderer):
|
||
"""Renders content to DOCX format using python-docx."""
|
||
|
||
@classmethod
def getSupportedFormats(cls) -> List[str]:
    """List the format identifiers this renderer can produce."""
    supportedFormats: List[str] = ['docx', 'doc']
    return supportedFormats
|
||
|
||
@classmethod
def getFormatAliases(cls) -> List[str]:
    """List the alternative names under which this renderer can be requested."""
    aliases: List[str] = ['word', 'document']
    return aliases
|
||
|
||
@classmethod
def getPriority(cls) -> int:
    """Report the selection priority of the DOCX renderer."""
    priority = 115
    return priority
|
||
|
||
@classmethod
def getOutputStyle(cls, formatName: Optional[str] = None) -> str:
    """Classify the output style; the result is 'document' for every format name."""
    outputStyle = 'document'
    return outputStyle
|
||
|
||
@classmethod
def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]:
    """
    List the section content types this renderer accepts.

    The DOCX renderer accepts every supported section type; the result is a
    fresh list copied from ``supportedSectionTypes``.
    """
    # Imported locally, exactly as the original does.
    from modules.datamodels.datamodelJson import supportedSectionTypes

    acceptedTypes = [sectionType for sectionType in supportedSectionTypes]
    return acceptedTypes
|
||
|
||
async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: Optional[str] = None, aiService=None) -> List[RenderedDocument]:
    """Render extracted JSON content to DOCX format using AI-analyzed styling.

    When python-docx is not installed the call is delegated to the HTML
    renderer. When DOCX generation fails, a single text/plain document
    describing the error is returned instead of raising.

    Args:
        extractedContent: Document JSON; ``metadata`` and ``documents`` are read here.
        title: Document title; also used to derive a filename when the first
            document entry does not provide one.
        userPrompt: Optional user prompt forwarded to document generation.
        aiService: Optional AI service forwarded to document generation.

    Returns:
        A list of RenderedDocument objects (DOCX bytes on success, a plain-text
        error report on failure, or whatever the HTML renderer returns).
    """
    docxMimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    def _readMetadata():
        # Tolerates None / non-dict content so the error path can never raise itself.
        rawMetadata = extractedContent.get("metadata", {}) if isinstance(extractedContent, dict) else {}
        if isinstance(rawMetadata, dict):
            return rawMetadata, rawMetadata.get("documentType")
        return None, None

    self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={userPrompt[:50] if userPrompt else 'None'}...", "DOCX_RENDERER")
    try:
        if not DOCX_AVAILABLE:
            # Fallback to HTML if python-docx not available
            from .rendererHtml import RendererHtml
            return await RendererHtml().render(extractedContent, title, userPrompt, aiService)

        # Generate DOCX using AI-analyzed styling
        docxContent = await self._generateDocxFromJson(extractedContent, title, userPrompt, aiService)

        # Extract metadata for document type and other info
        metadata, documentType = _readMetadata()

        # Prefer the filename of the first document entry, otherwise derive one from the title
        filename = None
        documents = extractedContent.get("documents", []) if isinstance(extractedContent, dict) else []
        if isinstance(documents, list) and documents and isinstance(documents[0], dict):
            filename = documents[0].get("filename")
        if not filename:
            filename = self._determineFilename(title, docxMimeType)

        # The generator returns base64 text; decode it to raw bytes
        if isinstance(docxContent, str):
            try:
                docxBytes = base64.b64decode(docxContent)
            except ValueError:
                # binascii.Error is a ValueError: not base64, keep the raw text bytes
                docxBytes = docxContent.encode('utf-8')
        else:
            docxBytes = docxContent

        return [
            RenderedDocument(
                documentData=docxBytes,
                mimeType=docxMimeType,
                filename=filename,
                documentType=documentType,
                metadata=metadata
            )
        ]

    except Exception as e:
        self.logger.error(f"Error rendering DOCX: {str(e)}", exc_info=True)
        # Return minimal fallback
        fallbackContent = f"DOCX Generation Error: {str(e)}"
        metadata, documentType = _readMetadata()
        return [
            RenderedDocument(
                documentData=fallbackContent.encode('utf-8'),
                mimeType="text/plain",
                filename=self._determineFilename(title, "text/plain"),
                documentType=documentType,
                metadata=metadata
            )
        ]
|
||
|
||
async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: Optional[str] = None, aiService=None) -> str:
    """Generate DOCX content from structured JSON document.

    Args:
        json_content: Document JSON in the standardized schema
            ``{metadata: {...}, documents: [{sections: [...]}]}``.
        title: Preferred document title; ``metadata["title"]`` is used only
            when this is empty.
        userPrompt: Optional user prompt passed to style-set resolution.
        aiService: Optional AI service passed to style-set resolution.

    Returns:
        The generated .docx file as a base64-encoded string.

    Raises:
        Exception: Wraps any failure as "DOCX generation failed: ..." with the
            original exception chained as ``__cause__``.
    """
    import time
    start_time = time.time()
    try:
        self.logger.debug("_generateDocxFromJson: Starting document generation")

        # Validate first so malformed input fails before the (possibly AI-backed) styling call
        if not self._validateJsonStructure(json_content):
            raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")

        # Create new document
        doc = Document()
        self.logger.debug(f"_generateDocxFromJson: Document created in {time.time() - start_time:.2f}s")

        # Get style set: use styles from metadata if available, otherwise enhance with AI
        style_start = time.time()
        self.logger.debug("_generateDocxFromJson: About to get style set")
        styleSet = await self._getStyleSet(json_content, userPrompt, aiService)
        self.logger.debug(f"_generateDocxFromJson: Style set retrieved in {time.time() - style_start:.2f}s")

        # Setup basic document styles and create all styles from style set
        setup_start = time.time()
        self.logger.debug("_generateDocxFromJson: Setting up document styles")
        self._setupBasicDocumentStyles(doc)
        self._setupDocumentStyles(doc, styleSet)
        self.logger.debug(f"_generateDocxFromJson: Document styles setup in {time.time() - setup_start:.2f}s")

        # Extract sections and metadata from standardized schema
        extract_start = time.time()
        self.logger.debug("_generateDocxFromJson: Extracting sections and metadata")
        sections = self._extractSections(json_content)
        metadata = self._extractMetadata(json_content)
        self.logger.debug(f"_generateDocxFromJson: Extracted {len(sections)} sections in {time.time() - extract_start:.2f}s")

        # Use provided title (which comes from documents[].title) as primary source,
        # falling back to metadata.title only if the title parameter is empty
        document_title = title if title else metadata.get("title", "Generated Document")

        # Add document title using Title style
        if document_title:
            doc.add_paragraph(document_title, style='Title')

        # Process each section in order
        render_start = time.time()
        self.logger.debug(f"_generateDocxFromJson: Starting to render {len(sections)} sections")
        for idx, section in enumerate(sections):
            section_start = time.time()
            self.logger.debug(f"_generateDocxFromJson: Rendering section {idx + 1}/{len(sections)}")
            self._renderJsonSection(doc, section, styleSet)
            self.logger.debug(f"_generateDocxFromJson: Section {idx + 1} rendered in {time.time() - section_start:.2f}s")
        self.logger.debug(f"_generateDocxFromJson: All sections rendered in {time.time() - render_start:.2f}s")

        # Save to buffer
        save_start = time.time()
        self.logger.debug("_generateDocxFromJson: Starting to save document to buffer")
        buffer = io.BytesIO()
        doc.save(buffer)
        self.logger.debug(f"_generateDocxFromJson: Document saved to buffer in {time.time() - save_start:.2f}s")

        # Convert to base64
        encode_start = time.time()
        self.logger.debug("_generateDocxFromJson: Converting to base64")
        docx_bytes = buffer.getvalue()
        docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
        self.logger.debug(f"_generateDocxFromJson: Converted to base64 in {time.time() - encode_start:.2f}s (document size: {len(docx_bytes)} bytes)")

        total_time = time.time() - start_time
        self.logger.info(f"_generateDocxFromJson: Document generation completed in {total_time:.2f}s")
        return docx_base64

    except Exception as e:
        self.logger.error(f"Error generating DOCX from JSON: {str(e)}")
        # Chain the cause so the original traceback is not lost
        raise Exception(f"DOCX generation failed: {str(e)}") from e
|
||
|
||
async def _getStyleSet(self, extractedContent: Dict[str, Any] = None, userPrompt: str = None, aiService=None, templateName: str = None) -> Dict[str, Any]:
    """Resolve the style set used for rendering.

    Styles supplied in the document generation metadata win; otherwise the
    template defaults are enhanced with AI when a user prompt and an AI
    service are both given; otherwise the template defaults are returned.

    IMPORTANT: In a dynamic scalable AI system, styling should come from document
    generation, not be generated separately by renderers. AI is only a fallback
    for the case where no styles were provided.

    Args:
        extractedContent: Document content with metadata (may contain styles)
        userPrompt: User's prompt (AI will detect style instructions in any language)
        aiService: AI service (used only if styles not in metadata and userPrompt provided)
        templateName: Name of template style set (None = default)

    Returns:
        Dict with style definitions for all document styles
    """
    # Template defaults are resolved up front, whichever branch ends up being used
    if templateName == "corporate":
        defaultStyleSet = self._getCorporateStyleSet()
    elif templateName == "minimal":
        defaultStyleSet = self._getMinimalStyleSet()
    else:
        defaultStyleSet = self._getDefaultStyleSet()

    # Preferred source: styles delivered with the document generation metadata
    metadata = extractedContent.get("metadata", {}) if extractedContent else None
    providedStyles = metadata.get("styles") if isinstance(metadata, dict) else None
    if providedStyles and isinstance(providedStyles, dict):
        self.logger.debug("Using styles from document generation metadata")
        return self._validateStylesContrast(providedStyles)

    # Without both a prompt and an AI service there is nothing to enhance
    if not (userPrompt and aiService):
        return defaultStyleSet

    # Fallback: let the AI adapt the defaults to the user's prompt
    self.logger.info("Styles not in metadata, enhancing with AI based on user prompt...")
    aiStyleSet = await self._enhanceStylesWithAI(userPrompt, defaultStyleSet, aiService)
    return self._validateStylesContrast(aiStyleSet)
|
||
|
||
async def _enhanceStylesWithAI(self, userPrompt: str, defaultStyleSet: Dict[str, Any], aiService) -> Dict[str, Any]:
    """Ask the AI service for a prompt-specific variant of the default styles.

    Any failure is logged as a warning and the unmodified ``defaultStyleSet``
    is returned instead.
    """
    try:
        aiTemplate = self._createAiStyleTemplate("docx", userPrompt, defaultStyleSet)
        return await self._getAiStyles(aiService, aiTemplate, defaultStyleSet)
    except Exception as err:
        self.logger.warning(f"AI style enhancement failed: {str(err)}, using default styles")
        return defaultStyleSet
|
||
|
||
def _validateStylesContrast(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||
"""Validate and fix contrast issues in AI-generated styles."""
|
||
try:
|
||
# Fix table header contrast
|
||
if "table_header" in styles:
|
||
header = styles["table_header"]
|
||
bg_color = header.get("background", "#FFFFFF")
|
||
text_color = header.get("text_color", "#000000")
|
||
|
||
# If both are white or both are dark, fix it
|
||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||
header["background"] = "#4F4F4F"
|
||
header["text_color"] = "#FFFFFF"
|
||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||
header["background"] = "#4F4F4F"
|
||
header["text_color"] = "#FFFFFF"
|
||
|
||
# Fix table cell contrast
|
||
if "table_cell" in styles:
|
||
cell = styles["table_cell"]
|
||
bg_color = cell.get("background", "#FFFFFF")
|
||
text_color = cell.get("text_color", "#000000")
|
||
|
||
# If both are white or both are dark, fix it
|
||
if bg_color.upper() == "#FFFFFF" and text_color.upper() == "#FFFFFF":
|
||
cell["background"] = "#FFFFFF"
|
||
cell["text_color"] = "#2F2F2F"
|
||
elif bg_color.upper() == "#000000" and text_color.upper() == "#000000":
|
||
cell["background"] = "#FFFFFF"
|
||
cell["text_color"] = "#2F2F2F"
|
||
|
||
return styles
|
||
|
||
except Exception as e:
|
||
self.logger.warning(f"Style validation failed: {str(e)}")
|
||
return self._getDefaultStyleSet()
|
||
|
||
def _getDefaultStyleSet(self) -> Dict[str, Any]:
|
||
"""Default DOCX style set - used when no style instructions present."""
|
||
return {
|
||
"title": {"font_size": 24, "color": "#1F4E79", "bold": True, "align": "left"},
|
||
"heading1": {"font_size": 18, "color": "#2F2F2F", "bold": True, "align": "left"},
|
||
"heading2": {"font_size": 14, "color": "#4F4F4F", "bold": True, "align": "left"},
|
||
"paragraph": {"font_size": 11, "color": "#2F2F2F", "bold": False, "align": "left"},
|
||
"table_header": {"background": "#4F4F4F", "text_color": "#FFFFFF", "bold": True, "align": "center"},
|
||
"table_cell": {"background": "#FFFFFF", "text_color": "#2F2F2F", "bold": False, "align": "left"},
|
||
"table_border": {"style": "horizontal_only", "color": "#000000", "thickness": "thin"},
|
||
"bullet_list": {"font_size": 11, "color": "#2F2F2F", "indent": 20},
|
||
"code_block": {"font": "Courier New", "font_size": 10, "color": "#2F2F2F", "background": "#F5F5F5"}
|
||
}
|
||
|
||
def _setupBasicDocumentStyles(self, doc: "Document") -> None:
    """Set the document's 'Normal' style to Calibri 11pt.

    Failures are logged as a warning and otherwise ignored.

    Args:
        doc: python-docx document whose ``styles['Normal']`` font is updated.
    """
    # The annotation is a string so that defining this method does not raise
    # NameError when python-docx is not installed (DOCX_AVAILABLE is False).
    try:
        normalFont = doc.styles['Normal'].font
        normalFont.name = 'Calibri'
        normalFont.size = Pt(11)
    except Exception as e:
        self.logger.warning(f"Could not set up basic document styles: {str(e)}")
|
||
|
||
|
||
|
||
|
||
def _clearTemplateContent(self, doc: "Document") -> None:
    """Clear template content while preserving styles.

    Paragraphs are kept but emptied via ``paragraph.clear()``; tables are
    removed from the document XML. Failures are logged as a warning.

    Args:
        doc: python-docx document to empty.
    """
    # The annotation is a string so that defining this method does not raise
    # NameError when python-docx is not installed (DOCX_AVAILABLE is False).
    try:
        # Keep each paragraph element but drop its content
        for paragraph in doc.paragraphs:
            paragraph.clear()

        # Detach every table element from its parent; iterate a snapshot
        # because the tree is modified inside the loop
        for table in list(doc.tables):
            table._element.getparent().remove(table._element)

    except Exception as e:
        self.logger.warning(f"Could not clear template content: {str(e)}")
|
||
|
||
def _renderJsonSection(self, doc: "Document", section: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a single JSON section to DOCX using the given styles.

    Each dict element of ``section["elements"]`` is dispatched by its own
    ``type``; when that is missing or unknown, the section's ``content_type``
    is used instead, with paragraph as the last resort. ``reference`` and
    ``extracted_text`` elements are rendered inline here. Sections without
    elements are skipped.

    Any exception aborts the rest of the section, is logged as a warning and
    leaves an "[Error rendering section: ...]" paragraph in the document.

    Args:
        doc: python-docx document being built.
        section: Section dict with ``content_type``, ``elements`` and optional ``id``.
        styles: Style set passed through to the element renderers.
    """
    # The ``doc`` annotation is a string so that defining this method does not
    # raise NameError when python-docx is not installed.
    # Renderer names are resolved lazily so only the renderer actually needed is looked up.
    rendererNames = {
        "table": "_renderJsonTable",
        "bullet_list": "_renderJsonBulletList",
        "heading": "_renderJsonHeading",
        "paragraph": "_renderJsonParagraph",
        "code_block": "_renderJsonCodeBlock",
        "image": "_renderJsonImage",
    }

    def _rendererFor(typeName):
        # Non-string type values can never match a known renderer
        return rendererNames.get(typeName) if isinstance(typeName, str) else None

    try:
        section_type = section.get("content_type", "paragraph")
        elements = section.get("elements", [])

        # If no elements, skip this section (it has no content to render)
        if not elements:
            return

        for element in elements:
            # Skip non-dict elements (e.g., int, str, etc.)
            if not isinstance(element, dict):
                continue
            element_type = element.get("type", "")

            if element_type == "reference":
                # Document reference: rendered as an italic placeholder line
                label = element.get("label", "Reference")
                para = doc.add_paragraph(f"[Reference: {label}]")
                para.runs[0].italic = True
                continue

            if element_type == "extracted_text":
                content = element.get("content", "")
                source = element.get("source", "")
                if content:
                    para = doc.add_paragraph()
                    self._addMarkdownInlineRuns(para, content)
                    if source:
                        para.add_run(f" (Source: {source})").italic = True
                continue

            # The element's own type wins over the section type
            rendererName = _rendererFor(element_type)
            if rendererName is None:
                # Fallback: section type, then paragraph for unknown types
                rendererName = _rendererFor(section_type) or "_renderJsonParagraph"
                if rendererName == "_renderJsonParagraph":
                    # Image elements may lack a type but carry base64Data in content
                    payload = element.get("content", {})
                    if isinstance(payload, dict) and payload.get("base64Data"):
                        rendererName = "_renderJsonImage"

            getattr(self, rendererName)(doc, element, styles)

    except Exception as e:
        self.logger.warning(f"Error rendering section {section.get('id', 'unknown')}: {str(e)}")
        # Add error paragraph as fallback
        doc.add_paragraph(f"[Error rendering section: {str(e)}]")
|
||
|
||
# ── Markdown inline → python-docx runs ──────────────────────────────
|
||
_MD_INLINE_RE = re.compile(
|
||
r"(\*\*(.+?)\*\*)" # group 1,2: bold
|
||
r"|(__(.+?)__)" # group 3,4: bold (underscore)
|
||
r"|(?<!\*)\*([^*\n]+?)\*(?!\*)" # group 5: italic
|
||
r"|(?<![\w/])_([^_\n]+?)_(?![\w/])" # group 6: italic (underscore)
|
||
r"|`([^`]+)`" # group 7: inline code
|
||
)
|
||
|
||
def _addMarkdownInlineRuns(self, paragraph, text: str) -> None:
    """Append ``text`` to ``paragraph`` as runs, honouring inline markdown.

    Bold (``**x**`` / ``__x__``), italic (``*x*`` / ``_x_``) and inline code
    (backticks, rendered in Courier New 9pt) become separate formatted runs;
    everything between matches is added as plain runs.
    """
    cursor = 0
    for match in self._MD_INLINE_RE.finditer(text):
        begin, finish = match.span()

        # Plain text between the previous token and this one
        if begin > cursor:
            paragraph.add_run(text[cursor:begin])

        boldText = match.group(2) or match.group(4)
        italicText = match.group(5) or match.group(6)
        codeText = match.group(7)

        if boldText:
            paragraph.add_run(boldText).bold = True
        elif italicText:
            paragraph.add_run(italicText).italic = True
        elif codeText:
            codeRun = paragraph.add_run(codeText)
            codeRun.font.name = "Courier New"
            codeRun.font.size = Pt(9)

        cursor = finish

    # Trailing plain text after the last token
    if cursor < len(text):
        paragraph.add_run(text[cursor:])
|
||
|
||
def _renderJsonTable(self, doc: "Document", table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """
    Render a JSON table element to DOCX.

    Headers and rows are read from ``table_data["content"]``. The element is
    skipped when ``content`` is not a dict or when either headers or rows are
    empty. The table itself is built by ``_renderTableFastXml``, which writes
    the XML directly instead of assigning ``cell.text`` cell by cell.

    Errors are logged (with traceback) and swallowed, so a failing table does
    not abort the surrounding section.

    Args:
        doc: python-docx document being built.
        table_data: Table element with ``content.headers`` and ``content.rows``.
        styles: Style set passed through to the table builder.
    """
    # The ``doc`` annotation is a string so that defining this method does not
    # raise NameError when python-docx is not installed.
    import time
    table_start = time.time()
    try:
        # Extract from nested content structure
        content = table_data.get("content", {})
        if not isinstance(content, dict):
            return
        headers = content.get("headers", [])
        rows = content.get("rows", [])

        if not headers or not rows:
            return

        totalRows = len(rows)
        totalCols = len(headers)
        totalCells = totalRows * totalCols

        self.logger.debug(f"_renderJsonTable: Starting FAST table render - {totalRows} rows x {totalCols} columns = {totalCells} cells")

        # Use fast XML-based table rendering
        self._renderTableFastXml(doc, headers, rows, styles)

        total_time = time.time() - table_start
        rate = totalCells / total_time if total_time > 0 else 0
        self.logger.info(f"_renderJsonTable: Table completed in {total_time:.2f}s ({totalRows} rows x {totalCols} cols = {totalCells} cells) - Rate: {rate:.0f} cells/s")

    except Exception as e:
        self.logger.error(f"Error rendering table: {str(e)}", exc_info=True)
|
||
|
||
def _renderTableFastXml(self, doc: "Document", headers: List[str], rows: List[List[Any]], styles: Dict[str, Any]) -> None:
    """
    Build a table as raw WordprocessingML and insert it into the document body.

    The table gets auto width, left alignment, borders chosen by
    ``styles["table_border"]["style"]`` (default "grid"), 80-twip cell margins,
    one header row and one row per entry in ``rows``. Short rows are padded
    with empty cells up to the header count. An empty paragraph is inserted
    after the table so consecutive tables are not merged by Word.

    The table and the separator paragraph are inserted *before* the body's
    ``w:sectPr`` element. python-docx inserts paragraphs before ``w:sectPr``
    as well, so a plain ``append`` would leave the table behind the section
    properties and behind every paragraph added afterwards.

    Args:
        doc: python-docx document being built.
        headers: Column headers; their count defines the column count.
        rows: Row data; cells are converted with ``str()``, ``None`` becomes ''.
        styles: Style set; only ``table_border.style`` is read here.
    """
    # The ``doc`` annotation is a string so that defining this method does not
    # raise NameError when python-docx is not installed.
    import time
    from docx.oxml.shared import OxmlElement, qn

    create_start = time.time()
    columnCount = len(headers)

    # Get the document body element
    body = doc._body._body

    # Create table element and its properties
    tbl = OxmlElement('w:tbl')
    tblPr = OxmlElement('w:tblPr')

    # Table width - auto
    tblW = OxmlElement('w:tblW')
    tblW.set(qn('w:type'), 'auto')
    tblW.set(qn('w:w'), '0')
    tblPr.append(tblW)

    jc = OxmlElement('w:jc')
    jc.set(qn('w:val'), 'left')
    tblPr.append(jc)

    # Apply table borders directly (works without template styles)
    borderConfig = styles.get("table_border", {})
    borderStyle = borderConfig.get("style", "grid") if isinstance(borderConfig, dict) else "grid"
    tblPr.append(self._createTableBordersXml(borderStyle))

    # Table cell margins for better readability
    tblCellMar = OxmlElement('w:tblCellMar')
    for side in ['top', 'left', 'bottom', 'right']:
        margin = OxmlElement(f'w:{side}')
        margin.set(qn('w:w'), '80')  # 80 twips = ~4pt padding
        margin.set(qn('w:type'), 'dxa')
        tblCellMar.append(margin)
    tblPr.append(tblCellMar)

    tbl.append(tblPr)

    # Create table grid (column definitions)
    tblGrid = OxmlElement('w:tblGrid')
    for _ in range(columnCount):
        tblGrid.append(OxmlElement('w:gridCol'))
    tbl.append(tblGrid)

    self.logger.debug(f"_renderTableFastXml: Table structure created in {time.time() - create_start:.3f}s")

    # Build all rows using fast XML
    rows_start = time.time()

    # Header row
    tbl.append(self._createTableRowXml(headers, isHeader=True))

    header_time = time.time() - rows_start
    self.logger.debug(f"_renderTableFastXml: Header row created in {header_time:.3f}s")

    # Data rows
    data_start = time.time()
    rowCount = len(rows)
    # Progress is only logged for larger tables, roughly every 10%
    progressStep = rowCount // 10 if rowCount > 100 else 0

    for idx, rowData in enumerate(rows):
        # Convert all cells to strings
        cellTexts = [str(cell) if cell is not None else '' for cell in rowData]
        # Pad short rows up to the column count
        if len(cellTexts) < columnCount:
            cellTexts.extend([''] * (columnCount - len(cellTexts)))

        tbl.append(self._createTableRowXml(cellTexts, isHeader=False))

        if progressStep and (idx + 1) % progressStep == 0:
            elapsed = time.time() - data_start
            rate = (idx + 1) * columnCount / elapsed if elapsed > 0 else 0
            self.logger.debug(f"_renderTableFastXml: Progress {((idx + 1) / rowCount * 100):.0f}% ({idx + 1}/{rowCount} rows) - Rate: {rate:.0f} cells/s")

    data_time = time.time() - data_start

    # Insert before w:sectPr (falls back to append when there is none) so the
    # table keeps its position relative to paragraphs added later
    body.insert_element_before(tbl, 'w:sectPr')

    # Add an empty paragraph after the table to prevent Word from merging consecutive tables
    separatorParagraph = OxmlElement('w:p')
    body.insert_element_before(separatorParagraph, 'w:sectPr')

    total_time = time.time() - create_start
    totalCells = (rowCount + 1) * columnCount
    rate = totalCells / total_time if total_time > 0 else 0

    self.logger.debug(f"_renderTableFastXml: All rows created in {data_time:.2f}s, total: {total_time:.2f}s, rate: {rate:.0f} cells/s")
|
||
|
||
def _createTableBordersXml(self, borderStyle: str) -> Any:
    """
    Build the ``w:tblBorders`` element for a table.

    Supported values of ``borderStyle``:
    - 'grid': single lines on all outer and inner edges
    - 'horizontal_only': single lines on top, bottom and between rows;
      left, right and inner vertical edges are set to 'nil'
    - anything else: single lines on the four outer edges only

    Drawn edges use a 0.5pt (sz=4) single line in colour 404040.
    """
    from docx.oxml.shared import OxmlElement, qn

    lineColor = '404040'
    lineSize = '4'  # eighths of a point -> 0.5pt

    def _edge(edgeName, drawn):
        edge = OxmlElement(f'w:{edgeName}')
        if drawn:
            edge.set(qn('w:val'), 'single')
            edge.set(qn('w:sz'), lineSize)
            edge.set(qn('w:space'), '0')
            edge.set(qn('w:color'), lineColor)
        else:
            edge.set(qn('w:val'), 'nil')
        return edge

    # Pick which edges are drawn and which are explicitly switched off
    if borderStyle == "grid":
        drawnEdges = ['top', 'left', 'bottom', 'right', 'insideH', 'insideV']
        hiddenEdges = []
    elif borderStyle == "horizontal_only":
        drawnEdges = ['top', 'bottom', 'insideH']
        hiddenEdges = ['left', 'right', 'insideV']
    else:
        drawnEdges = ['top', 'left', 'bottom', 'right']
        hiddenEdges = []

    tblBorders = OxmlElement('w:tblBorders')
    for edgeName in drawnEdges:
        tblBorders.append(_edge(edgeName, True))
    for edgeName in hiddenEdges:
        tblBorders.append(_edge(edgeName, False))
    return tblBorders
|
||
|
||
def _createTableRowXml(self, cells: List[str], isHeader: bool = False) -> Any:
    """
    Create a ``w:tr`` table row element with one ``w:tc`` per entry in ``cells``.

    The row XML is built directly instead of going through python-docx's
    ``cell.text`` assignment. Header rows are marked with ``w:tblHeader`` and
    their cells get a 4472C4 fill with bold white text.

    Cell values are made safe before they are written:
    - ``None`` becomes '' and any other non-string value is passed through
      ``str()`` (the header row is handed over without prior conversion);
    - control characters that are not allowed in XML are removed, because
      lxml raises ValueError on them and the whole table would be lost;
    - ``xml:space="preserve"`` is set whenever the text has leading or
      trailing whitespace of any kind.

    Args:
        cells: Cell values for the row, in column order.
        isHeader: Apply header row properties and header cell styling.

    Returns:
        The ``w:tr`` element.
    """
    from docx.oxml.shared import OxmlElement, qn

    # C0 control characters except tab, newline and carriage return
    illegalXmlChars = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

    tr = OxmlElement('w:tr')

    # Row properties for header
    if isHeader:
        trPr = OxmlElement('w:trPr')
        trPr.append(OxmlElement('w:tblHeader'))
        tr.append(trPr)

    for rawCell in cells:
        cellText = '' if rawCell is None else illegalXmlChars.sub('', str(rawCell))

        tc = OxmlElement('w:tc')

        # Cell properties: auto width
        tcPr = OxmlElement('w:tcPr')
        tcW = OxmlElement('w:tcW')
        tcW.set(qn('w:type'), 'auto')
        tcW.set(qn('w:w'), '0')
        tcPr.append(tcW)

        # Header cell styling - blue background
        if isHeader:
            shd = OxmlElement('w:shd')
            shd.set(qn('w:val'), 'clear')
            shd.set(qn('w:color'), 'auto')
            shd.set(qn('w:fill'), '4472C4')
            tcPr.append(shd)

        tc.append(tcPr)

        # Paragraph with a single run
        p = OxmlElement('w:p')
        r = OxmlElement('w:r')

        # Header text styling - bold and white
        if isHeader:
            rPr = OxmlElement('w:rPr')
            rPr.append(OxmlElement('w:b'))
            color = OxmlElement('w:color')
            color.set(qn('w:val'), 'FFFFFF')
            rPr.append(color)
            r.append(rPr)

        # Text element; keep leading/trailing whitespace from being collapsed
        t = OxmlElement('w:t')
        if cellText != cellText.strip():
            t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
        t.text = cellText
        r.append(t)

        p.append(r)
        tc.append(p)
        tr.append(tc)

    return tr
|
||
|
||
def _applyHorizontalBordersOnly(self, table) -> None:
    """Replace the table's borders so that only horizontal lines are drawn.

    Top, bottom and inner horizontal edges become 0.5pt black single lines;
    left, right and inner vertical edges are set to 'none'. Failures are
    logged as a warning.
    """
    try:
        from docx.oxml.shared import OxmlElement, qn

        tableElement = table._element

        # Make sure the table has a properties element to hold the borders
        properties = tableElement.find(qn('w:tblPr'))
        if properties is None:
            properties = OxmlElement('w:tblPr')
            tableElement.insert(0, properties)

        # Drop whatever borders were defined before
        staleBorders = properties.find(qn('w:tblBorders'))
        if staleBorders is not None:
            properties.remove(staleBorders)

        # (edge name, drawn?) in the order the elements are written
        edgePlan = (
            ('top', True),
            ('bottom', True),
            ('left', False),
            ('right', False),
            ('insideH', True),
            ('insideV', False),
        )

        borders = OxmlElement('w:tblBorders')
        for edgeName, isDrawn in edgePlan:
            edge = OxmlElement(f'w:{edgeName}')
            if isDrawn:
                edge.set(qn('w:val'), 'single')
                edge.set(qn('w:sz'), '4')
                edge.set(qn('w:space'), '0')
                edge.set(qn('w:color'), '000000')
            else:
                edge.set(qn('w:val'), 'none')
            borders.append(edge)

        properties.append(borders)

    except Exception as e:
        self.logger.warning(f"Could not apply horizontal borders: {str(e)}")
|
||
|
||
def _setCellBackground(self, cell, color: "RGBColor") -> None:
    """Set the background color of a table cell.

    The colour is converted to a lowercase 6-digit hex string and applied
    through ``_setCellBackgroundFast``, which replaces any existing shading.
    The colour is unpacked before the cell is touched, so an invalid colour
    leaves the cell unchanged. Failures are logged as a warning.

    Args:
        cell: Table cell object exposing the underlying XML as ``_element``.
        color: Colour that unpacks into three integer components (red, green, blue).
    """
    # The annotation is a string so that defining this method does not raise
    # NameError when python-docx is not installed (DOCX_AVAILABLE is False).
    try:
        # Convert RGBColor to hex string by unpacking RGB components
        red, green, blue = color
        hex_color = f"{red:02x}{green:02x}{blue:02x}"
    except Exception as e:
        self.logger.warning(f"Could not set cell background: {str(e)}")
        return

    self._setCellBackgroundFast(cell, hex_color)
|
||
|
||
def _setCellBackgroundFast(self, cell, hex_color: str) -> None:
    """
    Set a table cell's background from an already formatted hex colour string.

    Any existing ``w:shd`` element in the cell properties is replaced by a new
    one whose fill is ``hex_color``. Failures are logged as a warning.
    """
    try:
        from docx.oxml.shared import OxmlElement, qn

        cellElement = cell._element

        # Make sure the cell has a properties element
        cellProperties = cellElement.find(qn('w:tcPr'))
        if cellProperties is None:
            cellProperties = OxmlElement('w:tcPr')
            cellElement.insert(0, cellProperties)

        # Only one shading element may remain
        previousShading = cellProperties.find(qn('w:shd'))
        if previousShading is not None:
            cellProperties.remove(previousShading)

        newShading = OxmlElement('w:shd')
        for attribute, value in (('w:val', 'clear'), ('w:color', 'auto'), ('w:fill', hex_color)):
            newShading.set(qn(attribute), value)
        cellProperties.append(newShading)

    except Exception as e:
        self.logger.warning(f"Could not set cell background: {str(e)}")
|
||
|
||
|
||
def _renderJsonBulletList(self, doc: Document, list_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON bullet-list element as 'List Bullet' paragraphs.

    Items are read from ``list_data["content"]["items"]``; each item is either
    a string or a dict with a ``"text"`` key. Items without text are skipped.
    Item text is added through ``self._addMarkdownInlineRuns``.

    Optional ``font_size`` and ``color`` from ``styles["bullet_list"]`` are
    applied to every run of the item. Previously only the first run was
    styled, which left the rest of a multi-run item at the default size and
    colour.

    Args:
        doc: Target document.
        list_data: JSON element; nothing is rendered unless ``content`` is a dict.
        styles: Style set; only the ``"bullet_list"`` entry is read.

    Errors are logged as warnings and never raised.
    """
    try:
        content = list_data.get("content", {})
        if not isinstance(content, dict):
            return
        items = content.get("items", [])
        bullet_style = styles.get("bullet_list", {})

        # Parse the style once; the same objects are reused for every item.
        font_size_pt = None
        text_color_rgb = None
        if bullet_style:
            if "font_size" in bullet_style:
                font_size_pt = Pt(bullet_style["font_size"])
            if "color" in bullet_style:
                color_hex = bullet_style["color"].lstrip('#')
                text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))

        for item in items:
            itemText = item if isinstance(item, str) else (item.get("text", "") if isinstance(item, dict) else "")
            if not itemText:
                continue
            para = doc.add_paragraph(style='List Bullet')
            self._addMarkdownInlineRuns(para, itemText)

            if not (font_size_pt or text_color_rgb):
                continue

            for position, run in enumerate(para.runs):
                # The first run is styled unconditionally (legacy behaviour).
                # Later runs only receive values they do not already carry, so
                # formatting set by the inline-markdown step is not overwritten.
                # NOTE(review): assumes _addMarkdownInlineRuns may set size/colour
                # on some runs - confirm against that helper.
                is_first = position == 0
                if font_size_pt and (is_first or run.font.size is None):
                    run.font.size = font_size_pt
                if text_color_rgb and (is_first or run.font.color.rgb is None):
                    run.font.color.rgb = text_color_rgb

    except Exception as e:
        self.logger.warning(f"Error rendering bullet list: {str(e)}")
|
||
|
||
def _renderJsonHeading(self, doc: Document, heading_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON heading element as a document heading.

    Reads ``text`` and ``level`` from ``heading_data["content"]``. The level
    is coerced to an integer (falling back to 1 when it cannot be converted)
    and clamped to 1..6. Previously a non-integer level such as ``"2"`` raised
    inside the clamp and the heading was dropped with only a warning.

    Args:
        doc: Target document.
        heading_data: JSON element; nothing is rendered unless ``content`` is
            a dict with non-empty ``text``.
        styles: Style set (not used by this method).

    Errors are logged as warnings and never raised.
    """
    try:
        content = heading_data.get("content", {})
        if not isinstance(content, dict):
            return
        text = content.get("text", "")
        level = content.get("level", 1)

        if not text:
            return

        # JSON producers may deliver the level as a string, float or null.
        try:
            level = int(level)
        except (TypeError, ValueError):
            level = 1
        level = max(1, min(6, level))

        try:
            # Create an empty heading, then fill it with inline-formatted runs.
            para = doc.add_heading("", level=level)
            para.clear()
            self._addMarkdownInlineRuns(para, text)
        except (KeyError, ValueError):
            # Heading style unavailable or rejected: fall back to a plain paragraph.
            doc.add_paragraph(text)

    except Exception as e:
        self.logger.warning(f"Error rendering heading: {str(e)}")
|
||
|
||
def _renderJsonParagraph(self, doc: Document, paragraph_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON paragraph element.

    ``paragraph_data["content"]`` may be a dict with a ``"text"`` key or a
    plain string. Text that looks like base64 image data is not rendered; a
    red error note is written instead. Otherwise the text is added through
    ``self._addMarkdownInlineRuns`` and ``styles["paragraph"]`` is applied.

    Changes from the previous behaviour:
      * Font size and colour are applied to all runs, not only the first.
      * Bold is switched on only when the style asks for it. Before, the
        first run always got ``bold = False`` when the style had no ``bold``
        key, which removed bold set earlier on that run.

    Args:
        doc: Target document.
        paragraph_data: JSON element to render.
        styles: Style set; only the ``"paragraph"`` entry is read.

    Errors are logged as warnings and never raised.
    """
    try:
        content = paragraph_data.get("content", {})
        if isinstance(content, dict):
            text = content.get("text", "")
        elif isinstance(content, str):
            text = content
        else:
            text = ""

        # Guard against raw image payloads leaking into body text:
        # "/9j/" is the usual JPEG prefix, "iVBORw0KGgo" the PNG prefix, and a
        # long prefix made only of base64 alphabet characters is treated alike.
        base64_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
        if text and (text.startswith("/9j/") or text.startswith("iVBORw0KGgo") or
                     (len(text) > 100 and all(c in base64_alphabet for c in text[:100]))):
            self.logger.warning(f"Skipping rendering of what appears to be base64 data in paragraph (length: {len(text)})")
            para = doc.add_paragraph("[Error: Image data found in text content - image embedding may have failed]")
            if para.runs:
                para.runs[0].font.color.rgb = RGBColor(255, 0, 0)  # Red color for error
            return

        if not text:
            return

        para = doc.add_paragraph()
        self._addMarkdownInlineRuns(para, text)

        paragraph_style = styles.get("paragraph", {})
        if not paragraph_style:
            return

        # Parse the style once before touching any run.
        font_size_pt = None
        text_color_rgb = None
        if "font_size" in paragraph_style:
            font_size_pt = Pt(paragraph_style["font_size"])
        if "color" in paragraph_style:
            color_hex = paragraph_style["color"].lstrip('#')
            text_color_rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
        make_bold = bool(paragraph_style.get("bold", False))

        # Keep the legacy guarantee that a styled paragraph has at least one run.
        if len(para.runs) == 0:
            para.add_run()

        for position, run in enumerate(para.runs):
            # The first run is styled unconditionally (legacy behaviour).
            # Later runs only receive values they do not already carry.
            # NOTE(review): assumes _addMarkdownInlineRuns may set size/colour
            # on some runs - confirm against that helper.
            is_first = position == 0
            if font_size_pt and (is_first or run.font.size is None):
                run.font.size = font_size_pt
            if make_bold:
                run.font.bold = True
            if text_color_rgb and (is_first or run.font.color.rgb is None):
                run.font.color.rgb = text_color_rgb

        if "align" in paragraph_style:
            align = paragraph_style["align"]
            if align == "center":
                para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == "right":
                para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                para.alignment = WD_ALIGN_PARAGRAPH.LEFT

    except Exception as e:
        self.logger.warning(f"Error rendering paragraph: {str(e)}")
|
||
|
||
def _renderJsonCodeBlock(self, doc: Document, code_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Render a JSON code-block element as a monospaced paragraph.

    Reads ``code`` and ``language`` from ``code_data["content"]``. When a
    language is given, a bold "Code (<language>):" label paragraph precedes
    the code. Font name, size and optional colour come from
    ``styles["code_block"]`` (defaults: "Courier New", 9 pt, no colour) and
    are applied to the first run of the code paragraph.

    Args:
        doc: Target document.
        code_data: JSON element; nothing is rendered unless ``content`` is a
            dict with non-empty ``code``.
        styles: Style set; only the ``"code_block"`` entry is read.

    Errors are logged as warnings and never raised.
    """
    try:
        content = code_data.get("content", {})
        if not isinstance(content, dict):
            return
        code = content.get("code", "")
        language = content.get("language", "")
        code_style = styles.get("code_block", {})

        if not code:
            return

        if language:
            label = doc.add_paragraph(f"Code ({language}):")
            if len(label.runs) > 0:
                label.runs[0].bold = True

        # Resolve the style before the code paragraph is created.
        mono_font = code_style.get("font", "Courier New")
        mono_size = Pt(code_style.get("font_size", 9))
        mono_color = None
        if "color" in code_style:
            color_hex = code_style["color"].lstrip('#')
            mono_color = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))

        code_para = doc.add_paragraph(code)
        # Style the first run; create one only when the paragraph has none.
        if len(code_para.runs) > 0:
            target = code_para.runs[0]
        else:
            target = code_para.add_run()
        target.font.name = mono_font
        target.font.size = mono_size
        if mono_color:
            target.font.color.rgb = mono_color

    except Exception as e:
        self.logger.warning(f"Error rendering code block: {str(e)}")
|
||
|
||
def _renderJsonImage(self, doc: Document, image_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
    """Embed a base64-encoded image, followed by an optional italic caption.

    ``base64Data``, ``altText`` and ``caption`` are read from
    ``image_data["content"]`` and, where missing, from ``image_data`` itself.
    The picture is scaled down (never up) to fit 6.5 x 9.0 inches, assuming
    96 pixels per inch; if the size cannot be determined it is inserted
    6.0 inches wide. On any failure a red error paragraph is written instead.

    Args:
        doc: Target document.
        image_data: JSON element describing the image.
        styles: Style set (not used by this method).
    """
    try:
        content = image_data.get("content", {})
        base64_data, alt_text, caption = "", "Image", ""

        if isinstance(content, dict):
            base64_data = content.get("base64Data", "")
            alt_text = content.get("altText", "Image")
            caption = content.get("caption", "")
        elif isinstance(content, str):
            # A bare string payload is not a supported shape; nothing is rendered.
            self.logger.warning("Image content is a string, not a dict. This should not happen.")
            return

        # Fall back to fields placed directly on the element.
        base64_data = base64_data or image_data.get("base64Data", "")
        if not alt_text or alt_text == "Image":
            alt_text = image_data.get("altText", "Image")
        caption = caption or image_data.get("caption", "")

        if not base64_data:
            raise Exception("No image data provided (base64Data is empty)")

        try:
            image_stream = io.BytesIO(base64.b64decode(base64_data))

            try:
                from PIL import Image as PILImage
                img_width_px, img_height_px = PILImage.open(image_stream).size

                # Usable area of the page, leaving room for margins and text.
                max_width_inches = 6.5
                max_height_inches = 9.0

                # Pixel size converted with a fixed 96 px-per-inch assumption.
                img_width_inches = img_width_px / 96.0
                img_height_inches = img_height_px / 96.0

                width_scale = max_width_inches / img_width_inches if img_width_inches > max_width_inches else 1.0
                height_scale = max_height_inches / img_height_inches if img_height_inches > max_height_inches else 1.0
                scale = min(width_scale, height_scale, 1.0)  # shrink only

                # Only the width is passed on; the stream is rewound first
                # because it was read while determining the size.
                image_stream.seek(0)
                doc.add_picture(image_stream, width=Inches(img_width_inches * scale))
            except Exception:
                # Size detection or sized insert failed: conservative default width.
                image_stream.seek(0)
                doc.add_picture(image_stream, width=Inches(6.0))

            # Caption preference: explicit caption, then a caption derived from alt text.
            caption_text = None
            if caption:
                caption_text = caption
            elif alt_text and alt_text != "Image":
                if "Render as visual element:" in alt_text:
                    # Alt text carrying this marker: keep only the part after it.
                    parts = alt_text.split("Render as visual element:")
                    if len(parts) > 1:
                        filename = parts[1].strip()
                        caption_text = f"Figure: {filename}"
                    else:
                        caption_text = alt_text
                else:
                    caption_text = f"Figure: {alt_text}"

            if caption_text:
                caption_para = doc.add_paragraph(caption_text)
                caption_para.runs[0].italic = True
        except Exception as embedError:
            raise Exception(f"Failed to decode or embed image: {str(embedError)}") from embedError

    except Exception as e:
        self.logger.error(f"Error embedding image in DOCX: {str(e)}")
        errorMsg = f"[Error: Could not embed image '{image_data.get('altText', 'Image')}'. {str(e)}]"
        errorPara = doc.add_paragraph(errorMsg)
        if errorPara.runs:
            errorPara.runs[0].font.color.rgb = RGBColor(255, 0, 0)  # Red color for error
|
||
|
||
def _extractStructureFromPrompt(self, userPrompt: str, title: str) -> Dict[str, Any]:
|
||
"""Extract document structure from user prompt."""
|
||
structure = {
|
||
'title': title,
|
||
'sections': [],
|
||
'format': 'standard'
|
||
}
|
||
|
||
if not userPrompt:
|
||
return structure
|
||
|
||
# Extract title from prompt if not provided
|
||
if not title or title == "Generated Document":
|
||
# Look for "create a ... document" or "generate a ... report"
|
||
title_match = re.search(r'(?:create|generate|make)\s+a\s+([^,]+?)(?:\s+document|\s+report|\s+summary)', userPrompt.lower())
|
||
if title_match:
|
||
structure['title'] = title_match.group(1).strip().title()
|
||
|
||
# Extract sections from numbered lists in prompt
|
||
section_pattern = r'(\d+)\)?\s*([^,]+?)(?:\s*[,:]|\s*$)'
|
||
sections = re.findall(section_pattern, userPrompt)
|
||
|
||
for num, section_text in sections:
|
||
structure['sections'].append({
|
||
'number': int(num),
|
||
'title': section_text.strip(),
|
||
'level': 2 # H2 level
|
||
})
|
||
|
||
# If no numbered sections found, try to extract from "including:" patterns
|
||
if not structure['sections']:
|
||
including_match = re.search(r'including:\s*(.+?)(?:\.|$)', userPrompt, re.DOTALL)
|
||
if including_match:
|
||
including_text = including_match.group(1)
|
||
# Split by common separators
|
||
parts = re.split(r'[,;]\s*', including_text)
|
||
for i, part in enumerate(parts, 1):
|
||
part = part.strip()
|
||
if part:
|
||
structure['sections'].append({
|
||
'number': i,
|
||
'title': part,
|
||
'level': 2
|
||
})
|
||
|
||
# If still no sections, extract from any list-like patterns
|
||
if not structure['sections']:
|
||
# Look for bullet points or dashes
|
||
bullet_pattern = r'[-•]\s*([^,\n]+?)(?:\s*[,:]|\s*$)'
|
||
bullets = re.findall(bullet_pattern, userPrompt)
|
||
for i, bullet in enumerate(bullets, 1):
|
||
bullet = bullet.strip()
|
||
if bullet and len(bullet) > 3:
|
||
structure['sections'].append({
|
||
'number': i,
|
||
'title': bullet,
|
||
'level': 2
|
||
})
|
||
|
||
# If still no sections, extract from sentence structure
|
||
if not structure['sections']:
|
||
# Split prompt into sentences and use as sections
|
||
sentences = re.split(r'[.!?]\s+', userPrompt)
|
||
for i, sentence in enumerate(sentences[:5], 1): # Max 5 sections
|
||
sentence = sentence.strip()
|
||
if sentence and len(sentence) > 10 and not sentence.startswith(('Analyze', 'Create', 'Generate')):
|
||
structure['sections'].append({
|
||
'number': i,
|
||
'title': sentence[:50] + "..." if len(sentence) > 50 else sentence,
|
||
'level': 2
|
||
})
|
||
|
||
# Final fallback: create sections from prompt keywords
|
||
if not structure['sections']:
|
||
# Extract key action words from prompt
|
||
action_words = ['analyze', 'summarize', 'review', 'assess', 'evaluate', 'examine', 'investigate']
|
||
found_actions = []
|
||
for action in action_words:
|
||
if action in userPrompt.lower():
|
||
found_actions.append(action.title())
|
||
|
||
if found_actions:
|
||
for i, action in enumerate(found_actions[:3], 1):
|
||
structure['sections'].append({
|
||
'number': i,
|
||
'title': f"{action} Document Content",
|
||
'level': 2
|
||
})
|
||
else:
|
||
# Last resort: generic but meaningful sections
|
||
structure['sections'] = [
|
||
{'number': 1, 'title': 'Document Analysis', 'level': 2},
|
||
{'number': 2, 'title': 'Key Information', 'level': 2},
|
||
{'number': 3, 'title': 'Summary and Conclusions', 'level': 2}
|
||
]
|
||
|
||
return structure
|
||
|
||
def _generateFromStructure(self, doc, content: str, structure: Dict[str, Any]):
|
||
"""Generate DOCX content based on extracted structure."""
|
||
# Add sections based on prompt structure
|
||
for section in structure['sections']:
|
||
# Add section heading
|
||
doc.add_heading(f"{section['number']}) {section['title']}", level=section['level'])
|
||
|
||
# Add AI-generated content for this section
|
||
# Try to extract relevant content for this section from the AI response
|
||
section_content = self._extractSectionContent(content, section['title'])
|
||
|
||
if section_content:
|
||
doc.add_paragraph(section_content)
|
||
else:
|
||
# If no specific content found, add a note
|
||
doc.add_paragraph(f"Content for {section['title']} based on document analysis.")
|
||
|
||
# Add some spacing
|
||
doc.add_paragraph()
|
||
|
||
# Add the complete AI-generated content as additional analysis
|
||
if content and content.strip():
|
||
doc.add_heading("Complete Analysis", level=1)
|
||
doc.add_paragraph(content)
|
||
|
||
def _extractSectionContent(self, content: str, section_title: str) -> str:
|
||
"""Extract relevant content for a specific section from AI response."""
|
||
if not content or not section_title:
|
||
return ""
|
||
|
||
# Look for content that matches the section title
|
||
section_keywords = section_title.lower().split()
|
||
|
||
# Split content into paragraphs
|
||
paragraphs = content.split('\n\n')
|
||
|
||
relevant_paragraphs = []
|
||
for paragraph in paragraphs:
|
||
paragraph_lower = paragraph.lower()
|
||
# Check if paragraph contains keywords from section title
|
||
if any(keyword in paragraph_lower for keyword in section_keywords if len(keyword) > 3):
|
||
relevant_paragraphs.append(paragraph.strip())
|
||
|
||
if relevant_paragraphs:
|
||
return '\n\n'.join(relevant_paragraphs[:2]) # Max 2 paragraphs per section
|
||
|
||
return ""
|
||
|
||
def _setupDocumentStyles(self, doc: Document, styleSet: Dict[str, Any]) -> None:
    """Register the paragraph styles described by ``styleSet`` in the document.

    Meant to run before rendering so the styles exist when content is added.
    Only the keys ``title``, ``heading1``, ``heading2`` and ``paragraph`` are
    consulted. List styles are not created here.

    Args:
        doc: Document whose style collection is updated.
        styleSet: Mapping from style key to style configuration.

    Errors are logged as warnings and never raised.
    """
    # Style-set key -> document style name, in creation order.
    style_names = (
        ("title", "Title"),
        ("heading1", "Heading 1"),
        ("heading2", "Heading 2"),
        ("paragraph", "Custom Paragraph"),
    )
    try:
        from docx.enum.style import WD_STYLE_TYPE

        for key, style_name in style_names:
            if key in styleSet:
                self._createStyle(doc, style_name, styleSet[key], WD_STYLE_TYPE.PARAGRAPH)

    except Exception as e:
        self.logger.warning(f"Could not set up document styles: {str(e)}")
|
||
|
||
def _createStyle(self, doc: Document, styleName: str, styleConfig: Dict[str, Any], styleType) -> None:
    """Create or update a named style in the document's style collection.

    An existing style of that name is updated in place; otherwise a new style
    of ``styleType`` is added and based on 'Normal'. Recognised configuration
    keys: ``font_size`` (points), ``bold``, ``color`` (hex, optional leading
    '#'), ``font`` (font name) and ``align`` ("center", "right", anything
    else means left).

    Args:
        doc: Document whose styles are modified.
        styleName: Name of the style to create or update.
        styleConfig: Style configuration mapping.
        styleType: Style type passed to ``doc.styles.add_style`` for new styles.

    Errors are logged as warnings and never raised.
    """
    try:
        try:
            target_style = doc.styles[styleName]
        except KeyError:
            # Unknown name: add a fresh style that inherits from Normal.
            target_style = doc.styles.add_style(styleName, styleType)
            target_style.base_style = doc.styles['Normal']

        style_font = target_style.font
        if "font_size" in styleConfig:
            style_font.size = Pt(styleConfig["font_size"])
        if "bold" in styleConfig:
            style_font.bold = styleConfig["bold"]
        if "color" in styleConfig:
            color_hex = styleConfig["color"].lstrip('#')
            style_font.color.rgb = RGBColor(int(color_hex[0:2], 16), int(color_hex[2:4], 16), int(color_hex[4:6], 16))
        if "font" in styleConfig:
            style_font.name = styleConfig["font"]

        if "align" in styleConfig:
            align = styleConfig["align"]
            # Anything other than "center" or "right" is treated as left-aligned.
            if align == "center":
                alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == "right":
                alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                alignment = WD_ALIGN_PARAGRAPH.LEFT
            target_style.paragraph_format.alignment = alignment

    except Exception as e:
        self.logger.warning(f"Could not create style '{styleName}': {str(e)}")
|
||
|
||
def _processSection(self, doc, lines: list):
|
||
"""Process a section of content into DOCX elements."""
|
||
for line in lines:
|
||
if not line.strip():
|
||
continue
|
||
|
||
# Check for tables (lines with |)
|
||
if '|' in line and not line.startswith('|'):
|
||
# This might be part of a table, process as table
|
||
table_data = self._extractTableData(lines)
|
||
if table_data:
|
||
self._addTable(doc, table_data)
|
||
return
|
||
|
||
# Check for lists
|
||
if line.startswith('- ') or line.startswith('* '):
|
||
# This is a list item
|
||
doc.add_paragraph(line[2:], style='List Bullet')
|
||
elif line.startswith(('1. ', '2. ', '3. ', '4. ', '5. ')):
|
||
# This is a numbered list item
|
||
doc.add_paragraph(line[3:], style='List Number')
|
||
else:
|
||
# Regular paragraph
|
||
doc.add_paragraph(line)
|
||
|
||
def _extractTableData(self, lines: list) -> list:
|
||
"""Extract table data from lines."""
|
||
table_data = []
|
||
in_table = False
|
||
|
||
for line in lines:
|
||
if '|' in line:
|
||
if not in_table:
|
||
in_table = True
|
||
# Split by | and clean up
|
||
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
|
||
if cells:
|
||
table_data.append(cells)
|
||
elif in_table and not line.strip():
|
||
# Empty line, might be end of table
|
||
break
|
||
|
||
return table_data if len(table_data) > 1 else []
|
||
|
||
def _addTable(self, doc, table_data: list):
    """Append a left-aligned table filled from ``table_data`` to the document.

    The table has one row per entry of ``table_data`` and as many columns as
    the first row; surplus values in longer rows are ignored. The header row
    is styled via ``self._styleTable`` and an empty paragraph is added after
    the table so consecutive tables are not merged by Word.

    Args:
        doc: Target document.
        table_data: Rows as lists of cell strings; an empty list is a no-op.

    Errors are logged as warnings and never raised.
    """
    try:
        if not table_data:
            return

        table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
        table.alignment = WD_TABLE_ALIGNMENT.LEFT

        # zip stops at the shorter side, so values beyond the table width are skipped.
        for table_row, values in zip(table.rows, table_data):
            for cell, value in zip(table_row.cells, values):
                cell.text = value

        self._styleTable(table)

        # Spacer paragraph keeps consecutive tables separate.
        doc.add_paragraph()

    except Exception as e:
        self.logger.warning(f"Could not add table: {str(e)}")
|
||
|
||
def _styleTable(self, table):
|
||
"""Apply styling to the table."""
|
||
try:
|
||
# Style header row
|
||
if len(table.rows) > 0:
|
||
header_cells = table.rows[0].cells
|
||
for cell in header_cells:
|
||
for paragraph in cell.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.bold = True
|
||
except Exception as e:
|
||
self.logger.warning(f"Could not style table: {str(e)}")
|
||
|
||
def _processTableRow(self, doc, line: str):
|
||
"""Process a table row and add it to the document."""
|
||
if not line.strip():
|
||
return
|
||
|
||
# Split by pipe separator
|
||
parts = [part.strip() for part in line.split('|')]
|
||
|
||
if len(parts) >= 2:
|
||
# This is a table row - create a table if it doesn't exist
|
||
if not hasattr(self, '_current_table') or self._current_table is None:
|
||
# Create new table
|
||
self._current_table = doc.add_table(rows=1, cols=len(parts))
|
||
self._current_table.style = 'Table Grid'
|
||
|
||
# Add header row
|
||
for i, part in enumerate(parts):
|
||
if i < len(self._current_table.rows[0].cells):
|
||
cell = self._current_table.rows[0].cells[i]
|
||
cell.text = part
|
||
# Make header bold
|
||
for paragraph in cell.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.bold = True
|
||
else:
|
||
# Add data row to existing table
|
||
row = self._current_table.add_row()
|
||
for i, part in enumerate(parts):
|
||
if i < len(row.cells):
|
||
row.cells[i].text = part
|
||
else:
|
||
# Not a table row, treat as regular text
|
||
doc.add_paragraph(line)
|
||
|
||
def _cleanAiContent(self, content: str) -> str:
|
||
"""Clean AI-generated content by removing debug information and duplicates."""
|
||
if not content:
|
||
return ""
|
||
|
||
# Remove debug information
|
||
lines = content.split('\n')
|
||
clean_lines = []
|
||
|
||
for line in lines:
|
||
# Skip debug lines and separators
|
||
if (line.startswith('[Skipped ') or
|
||
line.startswith('=== DOCUMENT:') or
|
||
line.startswith('---') or
|
||
line.startswith('FILENAME:') or
|
||
line.strip() == '' or
|
||
line.strip() == '---'):
|
||
continue
|
||
clean_lines.append(line)
|
||
|
||
# Join lines and remove duplicate content
|
||
clean_content = '\n'.join(clean_lines)
|
||
|
||
# Remove duplicate sections by keeping only the first occurrence
|
||
sections = clean_content.split('\n\n')
|
||
seen_sections = set()
|
||
unique_sections = []
|
||
|
||
for section in sections:
|
||
section_key = section.strip()[:50] # Use first 50 chars as key
|
||
if section_key not in seen_sections and section.strip():
|
||
seen_sections.add(section_key)
|
||
unique_sections.append(section)
|
||
|
||
return '\n\n'.join(unique_sections)
|
||
|
||
def _processTables(self, doc, content: str) -> str:
|
||
"""
|
||
Process tables in the content (both CSV and pipe-separated) and convert them to Word tables.
|
||
Returns the content with tables replaced by placeholders.
|
||
"""
|
||
# csv is already imported at module level
|
||
|
||
lines = content.split('\n')
|
||
processed_lines = []
|
||
i = 0
|
||
|
||
while i < len(lines):
|
||
line = lines[i].strip()
|
||
|
||
# Check if this line looks like a table (contains pipes or commas with multiple fields)
|
||
is_pipe_table = '|' in line and len(line.split('|')) >= 2
|
||
is_csv_table = ',' in line and len(line.split(',')) >= 2
|
||
|
||
if is_pipe_table or is_csv_table:
|
||
# Collect consecutive table lines
|
||
table_lines = []
|
||
j = i
|
||
|
||
# Determine separator and collect lines
|
||
separator = '|' if is_pipe_table else ','
|
||
while j < len(lines):
|
||
current_line = lines[j].strip()
|
||
if separator in current_line and len(current_line.split(separator)) >= 2:
|
||
table_lines.append(current_line)
|
||
j += 1
|
||
else:
|
||
break
|
||
|
||
if len(table_lines) >= 2: # At least header + 1 data row
|
||
# Create Word table
|
||
try:
|
||
if separator == '|':
|
||
# Process pipe-separated table
|
||
rows = []
|
||
for table_line in table_lines:
|
||
# Split by pipe and clean up
|
||
cells = [cell.strip() for cell in table_line.split('|')]
|
||
rows.append(cells)
|
||
else:
|
||
# Process CSV table
|
||
csv_content = '\n'.join(table_lines)
|
||
csv_reader = csv.reader(io.StringIO(csv_content))
|
||
rows = list(csv_reader)
|
||
|
||
if rows and len(rows[0]) > 0:
|
||
# Create Word table
|
||
table = doc.add_table(rows=len(rows), cols=len(rows[0]))
|
||
table.style = 'Table Grid'
|
||
|
||
# Populate table
|
||
for row_idx, row_data in enumerate(rows):
|
||
for col_idx, cell_data in enumerate(row_data):
|
||
if col_idx < len(table.rows[row_idx].cells):
|
||
table.rows[row_idx].cells[col_idx].text = cell_data.strip()
|
||
|
||
# Make header row bold
|
||
if row_idx == 0:
|
||
for cell in table.rows[row_idx].cells:
|
||
for paragraph in cell.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.bold = True
|
||
|
||
# Add an empty paragraph after the table to prevent Word from merging consecutive tables
|
||
doc.add_paragraph()
|
||
|
||
# Add placeholder to mark where table was inserted
|
||
processed_lines.append(f"[TABLE_INSERTED_{len(processed_lines)}]")
|
||
|
||
# Skip the table lines
|
||
i = j
|
||
continue
|
||
except Exception as e:
|
||
# If table parsing fails, treat as regular text
|
||
pass
|
||
|
||
processed_lines.append(line)
|
||
i += 1
|
||
|
||
return '\n'.join(processed_lines)
|
||
|
||
def _parseAndFormatContent(self, doc, content: str, title: str):
|
||
"""Parse AI-generated content in standardized format and apply proper DOCX formatting."""
|
||
if not content:
|
||
return
|
||
|
||
# Process tables and replace them with placeholders
|
||
content = self._processTables(doc, content)
|
||
|
||
# Parse content line by line in exact sequence
|
||
lines = content.split('\n')
|
||
|
||
for line in lines:
|
||
line = line.strip()
|
||
if not line:
|
||
# Empty line - add paragraph break
|
||
doc.add_paragraph()
|
||
continue
|
||
|
||
# Skip table placeholders (already processed)
|
||
if line.startswith('[TABLE_INSERTED_'):
|
||
continue
|
||
|
||
# Check if this is a Markdown heading (# ## ###)
|
||
if line.startswith('#'):
|
||
level = len(line) - len(line.lstrip('#'))
|
||
heading_text = line.lstrip('# ').strip()
|
||
doc.add_heading(heading_text, level=min(level, 3))
|
||
|
||
# Check if this is a numbered heading (1) Title, 2) Title, etc.)
|
||
elif re.match(r'^\d+\)\s+.+', line):
|
||
heading_text = re.sub(r'^\d+\)\s+', '', line)
|
||
doc.add_heading(heading_text, level=1)
|
||
|
||
# Check if this is a Markdown list item
|
||
elif line.startswith('- ') or re.match(r'^\d+\.\s+', line):
|
||
bullet_text = re.sub(r'^[-•]\s+|\d+\.\s+', '', line)
|
||
self._add_bullet_point(doc, bullet_text)
|
||
|
||
# Check if this is a code block
|
||
elif line.startswith('```'):
|
||
if not line.endswith('```'):
|
||
# Start of code block - collect until end
|
||
code_lines = [line]
|
||
continue
|
||
else:
|
||
# End of code block
|
||
if 'code_lines' in locals():
|
||
code_lines.append(line)
|
||
code_text = '\n'.join(code_lines)
|
||
para = doc.add_paragraph()
|
||
run = para.add_run(code_text)
|
||
run.font.name = 'Courier New'
|
||
del code_lines
|
||
|
||
# Regular paragraph
|
||
else:
|
||
self._addParagraphToDoc(doc, line)
|
||
|
||
def _addParagraphToDoc(self, doc, text: str):
|
||
"""Add a paragraph to the document with proper formatting."""
|
||
if not text.strip():
|
||
return
|
||
|
||
# Check for Markdown formatting (**bold**, *italic*)
|
||
para = doc.add_paragraph()
|
||
|
||
# Split by bold markers
|
||
parts = text.split('**')
|
||
for i, part in enumerate(parts):
|
||
if i % 2 == 0:
|
||
# Regular text - check for italic
|
||
italic_parts = part.split('*')
|
||
for j, italic_part in enumerate(italic_parts):
|
||
if j % 2 == 0:
|
||
# Regular text
|
||
if italic_part:
|
||
para.add_run(italic_part)
|
||
else:
|
||
# Italic text
|
||
if italic_part:
|
||
run = para.add_run(italic_part)
|
||
run.italic = True
|
||
else:
|
||
# Bold text
|
||
if part:
|
||
run = para.add_run(part)
|
||
run.bold = True |