refined generation prompt

2026-01-06 21:24:54 +01:00 · 2026-01-06 21:24:54 +01:00 · 043ffe1044
commit 043ffe1044
parent 2c964b254b
9 changed files with 458 additions and 30 deletions
--- a/modules/services/serviceAi/subStructureFilling.py
+++ b/modules/services/serviceAi/subStructureFilling.py
@ -1837,11 +1837,9 @@ When determining which sections to create for this chapter, consider the documen
 - Select content types that are well-suited for the target format
 - **CRITICAL**: Only use section types from the ACCEPTED SECTION TYPES list above

-useAiCall RULES:
- useAiCall: true ONLY if ContentPart Format is "extracted" AND transformation needed
- useAiCall: false if Format is "object" or "reference" (direct insertion)
- useAiCall: false if Format is "extracted" AND simple "include full text" instruction
- useAiCall: true if no ContentPartIds provided (content must be generated from scratch); Sections without ContentParts must have a clear, detailed generationHint explaining what content to generate
+useAiCall RULE (simple):
+- useAiCall: true → Content needs AI processing (extract, transform, generate, filter, summarize)
+- useAiCall: false → Content can be inserted directly without changes (Format is "object" or "reference")

 RETURN JSON:
 {{
@ -1849,10 +1847,9 @@ RETURN JSON:
    {{
      "id": "section_1",
      "content_type": "paragraph",
-      "contentPartIds": ["extracted_part_1"],
-      "generationHint": "Include full text",
-      "useAiCall": false,
-      "caption": "optional, only for image sections",
+      "contentPartIds": ["extracted_part_id"],
+      "generationHint": "Description of what to extract or generate",
+      "useAiCall": true,
      "elements": []
    }}
  ]
--- a/modules/services/serviceAi/subStructureGeneration.py
+++ b/modules/services/serviceAi/subStructureGeneration.py
@ -418,13 +418,24 @@ Then chapters that generate those generic content types MUST assign the relevant

 ## CHAPTER STRUCTURE REQUIREMENTS
 - Generate chapters based on USER REQUEST - analyze what structure the user wants
- Each chapter needs: id, level (1, 2, 3, etc.), title
+- IMPORTANT: Each chapter MUST have ALL these fields:
+  - id: Unique identifier (e.g., "chapter_1")
+  - level: Heading level (1, 2, 3, etc.)
+  - title: Chapter title
+  - contentParts: Object mapping ContentPart IDs to usage instructions
+  - generationHint: Description of what content to generate
+  - sections: Empty array [] (REQUIRED - sections are generated in next phase)
 - contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Assign ContentParts as required by CONTENT ASSIGNMENT RULE above
 - The "instruction" field for each ContentPart MUST contain ALL relevant details from the USER REQUEST that apply to content extraction for this specific chapter. Include all formatting rules, data requirements, constraints, and specifications mentioned in the user request that are relevant for processing this ContentPart in this chapter.
 - generationHint: Description of what content to generate for this chapter
  The generationHint MUST contain ALL relevant details from the USER REQUEST that apply to this specific chapter. Include all formatting rules, data requirements, constraints, column specifications, validation rules, and any other specifications mentioned in the user request that are relevant for generating content for this chapter. Do NOT use generic descriptions - include specific details from the user request.
 - The number of chapters depends on the user request - create only what is requested

+## WHAT IS A CHAPTER vs WHAT IS FORMATTING
+- A CHAPTER contains CONTENT (text, tables, lists, images, etc.)
+- FORMATTING INSTRUCTIONS (CSS styling, spacing, typography, colors, borders) are NOT separate chapters
+- If user mentions formatting topics, apply these to ALL chapters via generationHint, do NOT create a separate "Formatting" chapter
+
 ## DOCUMENT OUTPUT FORMAT
 For each document, determine the output format by analyzing the USER REQUEST:
 - Look for explicit format mentions
@ -463,7 +474,7 @@ For each document, determine the language by analyzing the USER REQUEST:
    - title: Chapter title
    - contentParts: Object mapping ContentPart IDs to usage instructions {{"partId": {{"instruction": "..."}} or {{"caption": "..."}}}}
    - generationHint: Description of what content to generate
-    - sections: Empty array []
+    - sections: Empty array [] (MANDATORY - always include this field)

 EXAMPLE STRUCTURE (for reference only - adapt to user request):
 {{
--- a/modules/services/serviceGeneration/renderers/rendererDocx.py
+++ b/modules/services/serviceGeneration/renderers/rendererDocx.py
@ -474,17 +474,6 @@ class RendererDocx(BaseRenderer):
        # Add table properties
        tblPr = OxmlElement('w:tblPr')
        
-        # Table style
-        border_style = styles.get("table_border", {}).get("style", "grid")
-        tblStyle = OxmlElement('w:tblStyle')
-        if border_style == "grid":
-            tblStyle.set(qn('w:val'), 'LightGridAccent1')
-        elif border_style == "horizontal_only":
-            tblStyle.set(qn('w:val'), 'LightListAccent1')
-        else:
-            tblStyle.set(qn('w:val'), 'LightList')
-        tblPr.append(tblStyle)
-        
        # Table width - auto
        tblW = OxmlElement('w:tblW')
        tblW.set(qn('w:type'), 'auto')
@ -496,6 +485,20 @@ class RendererDocx(BaseRenderer):
        jc.set(qn('w:val'), 'center')
        tblPr.append(jc)
        
+        # Apply table borders directly (works without template styles)
+        borderStyle = styles.get("table_border", {}).get("style", "grid")
+        tblBorders = self._createTableBordersXml(borderStyle)
+        tblPr.append(tblBorders)
+        
+        # Table cell margins for better readability
+        tblCellMar = OxmlElement('w:tblCellMar')
+        for side in ['top', 'left', 'bottom', 'right']:
+            margin = OxmlElement(f'w:{side}')
+            margin.set(qn('w:w'), '80')  # 80 twips = ~4pt padding
+            margin.set(qn('w:type'), 'dxa')
+            tblCellMar.append(margin)
+        tblPr.append(tblCellMar)
+        
        tbl.append(tblPr)
        
        # Create table grid (column definitions)
@ -548,6 +551,59 @@ class RendererDocx(BaseRenderer):
        
        self.logger.debug(f"_renderTableFastXml: All rows created in {data_time:.2f}s, total: {total_time:.2f}s, rate: {rate:.0f} cells/s")
    
+    def _createTableBordersXml(self, borderStyle: str) -> Any:
+        """
+        Build the w:tblBorders element for a table from a border style name.
+
+        The borders are written directly on the table properties.
+
+        Args:
+            borderStyle: One of
+                - 'grid': all outer edges plus inner horizontal and vertical lines
+                - 'horizontal_only': top, bottom and inner horizontal lines; the
+                  left, right and inner vertical edges are explicitly set to 'nil'
+                - anything else (including 'none'): outer frame only, no inner lines
+
+        Returns:
+            The populated w:tblBorders element.
+        """
+        from docx.oxml.shared import OxmlElement, qn
+
+        borderColor = '404040'  # dark gray
+        borderSize = '4'  # 0.5pt (w:sz is in eighths of a point)
+
+        # Children are emitted in this fixed order for every style. The
+        # WordprocessingML schema declares w:tblBorders as a sequence, so
+        # e.g. w:left must not follow w:insideH as it did for 'horizontal_only'.
+        edgeOrder = ('top', 'left', 'bottom', 'right', 'insideH', 'insideV')
+
+        if borderStyle == "grid":
+            visibleEdges = set(edgeOrder)
+            hiddenEdges = set()
+        elif borderStyle == "horizontal_only":
+            visibleEdges = {'top', 'bottom', 'insideH'}
+            hiddenEdges = {'left', 'right', 'insideV'}
+        else:
+            visibleEdges = {'top', 'left', 'bottom', 'right'}
+            hiddenEdges = set()
+
+        tblBorders = OxmlElement('w:tblBorders')
+        for edgeName in edgeOrder:
+            if edgeName in visibleEdges:
+                border = OxmlElement(f'w:{edgeName}')
+                border.set(qn('w:val'), 'single')
+                border.set(qn('w:sz'), borderSize)
+                border.set(qn('w:space'), '0')
+                border.set(qn('w:color'), borderColor)
+            elif edgeName in hiddenEdges:
+                # No line on this edge
+                border = OxmlElement(f'w:{edgeName}')
+                border.set(qn('w:val'), 'nil')
+            else:
+                continue
+            tblBorders.append(border)
+
+        return tblBorders
+    
    def _createTableRowXml(self, cells: List[str], isHeader: bool = False) -> Any:
        """
        Create a table row XML element with cells.
@ -570,12 +626,21 @@ class RendererDocx(BaseRenderer):
            # Create cell
            tc = OxmlElement('w:tc')
            
-            # Cell properties (minimal)
+            # Cell properties
            tcPr = OxmlElement('w:tcPr')
            tcW = OxmlElement('w:tcW')
            tcW.set(qn('w:type'), 'auto')
            tcW.set(qn('w:w'), '0')
            tcPr.append(tcW)
+            
+            # Header cell styling - blue background (fill 4472C4)
+            if isHeader:
+                shd = OxmlElement('w:shd')
+                shd.set(qn('w:val'), 'clear')
+                shd.set(qn('w:color'), 'auto')
+                shd.set(qn('w:fill'), '4472C4')  # Professional blue
+                tcPr.append(shd)
+            
            tc.append(tcPr)
            
            # Paragraph with text
@ -584,11 +649,15 @@ class RendererDocx(BaseRenderer):
            # Add run with text
            r = OxmlElement('w:r')
            
-            # Bold for headers
+            # Header text styling - bold and white
            if isHeader:
                rPr = OxmlElement('w:rPr')
                b = OxmlElement('w:b')
                rPr.append(b)
+                # White text color
+                color = OxmlElement('w:color')
+                color.set(qn('w:val'), 'FFFFFF')
+                rPr.append(color)
                r.append(rPr)
            
            # Text element
--- a/modules/shared/jsonContinuation.py
+++ b/modules/shared/jsonContinuation.py
@ -1748,6 +1748,13 @@ def _repairInternalJsonErrors(jsonStr: str) -> str:
    # Fix 6: Fix unquoted keys (simple cases only)
    result = _fixUnquotedKeys(result)
    
+    # Fix 7: Fix unescaped quotes inside string values
+    # This handles AI-generated JSON with quotes like: "text with "quoted" words"
+    result = _fixUnescapedQuotesInStrings(result)
+    
+    # Fix 8: Fix unescaped control characters (ASCII 0-31)
+    result = _fixUnescapedControlCharacters(result)
+    
    return result


@ -1946,6 +1953,162 @@ def _fixUnquotedKeys(jsonStr: str) -> str:
    return ''.join(result)


+def _fixUnescapedQuotesInStrings(jsonStr: str) -> str:
+    """
+    Escape double quotes that appear unescaped inside JSON string values.
+
+    AI-generated JSON often contains text such as
+        "text with "quoted" words"
+    which has to become
+        "text with \\"quoted\\" words"
+
+    Heuristic: while inside a string, a quote counts as the closing quote
+    only when the next non-whitespace character is one of : , } ] or another
+    quote, or when the input ends. Any other quote gets a backslash in front.
+    A character that directly follows a backslash inside a string is copied
+    as-is.
+
+    Limitation: an inner quote that happens to be followed by one of those
+    characters (e.g. "he said "hi", then left") is still taken as the end
+    of the string.
+
+    Args:
+        jsonStr: JSON text, possibly malformed.
+
+    Returns:
+        The text with inner quotes escaped; empty or whitespace-only input
+        is returned unchanged.
+    """
+    if not jsonStr or not jsonStr.strip():
+        return jsonStr
+
+    length = len(jsonStr)
+    pieces = []
+    inString = False
+    afterBackslash = False
+
+    for pos, ch in enumerate(jsonStr):
+        if afterBackslash:
+            # Second half of an escape pair - copy it untouched
+            afterBackslash = False
+        elif ch == '\\' and inString:
+            afterBackslash = True
+        elif ch == '"':
+            if not inString:
+                inString = True
+            else:
+                # Look past whitespace to decide: closing quote or inner quote?
+                look = pos + 1
+                while look < length and jsonStr[look] in ' \t\n\r':
+                    look += 1
+                closesString = (
+                    look >= length
+                    or jsonStr[look] in ':,}]'
+                    or jsonStr[look] == '"'
+                )
+                if closesString:
+                    inString = False
+                else:
+                    pieces.append('\\')
+        pieces.append(ch)
+
+    return ''.join(pieces)
+
+
+def _fixUnescapedControlCharacters(jsonStr: str) -> str:
+    """
+    Escape raw control characters (code points below 32) inside JSON strings.
+
+    JSON does not allow such characters unescaped within a string. Newline,
+    carriage return, tab, backspace and form feed get their short forms
+    (\\n, \\r, \\t, \\b, \\f); every other one is written as \\uXXXX.
+    Characters outside of strings, and any character that directly follows
+    a backslash inside a string, are copied unchanged.
+
+    Args:
+        jsonStr: JSON text, possibly malformed.
+
+    Returns:
+        The text with control characters inside strings escaped; empty or
+        whitespace-only input is returned unchanged.
+    """
+    if not jsonStr or not jsonStr.strip():
+        return jsonStr
+
+    shortForms = {
+        '\n': '\\n',
+        '\r': '\\r',
+        '\t': '\\t',
+        '\b': '\\b',
+        '\f': '\\f',
+    }
+
+    pieces = []
+    insideString = False
+    afterBackslash = False
+
+    for ch in jsonStr:
+        if afterBackslash:
+            # Second half of an escape pair - copy it untouched
+            afterBackslash = False
+        elif insideString and ch == '\\':
+            afterBackslash = True
+        elif ch == '"':
+            # An unescaped quote toggles whether we are inside a string
+            insideString = not insideString
+        elif insideString and ord(ch) < 32:
+            ch = shortForms.get(ch, f'\\u{ord(ch):04x}')
+        pieces.append(ch)
+
+    return ''.join(pieces)
+
+
 def _tryParseJson(jsonStr: str) -> tuple:
    """
    Try to parse JSON string and return (parsed, error).
--- a/modules/shared/jsonUtils.py
+++ b/modules/shared/jsonUtils.py
@ -123,6 +123,160 @@ def tryParseJson(text: Union[str, bytes]) -> Tuple[Optional[Union[Dict, List]],
        return None, e, cleaned


+def _fixUnescapedQuotesInStrings(jsonStr: str) -> str:
+    """
+    Escape double quotes that appear unescaped inside JSON string values.
+
+    AI-generated JSON often contains text such as
+        "text with "quoted" words"
+    which has to become
+        "text with \\"quoted\\" words"
+
+    Heuristic: while inside a string, a quote counts as the closing quote
+    only when the next non-whitespace character is one of : , } ] or another
+    quote, or when the input ends. Any other quote gets a backslash in front.
+    A character that directly follows a backslash inside a string is copied
+    as-is.
+
+    Limitation: an inner quote that happens to be followed by one of those
+    characters (e.g. "he said "hi", then left") is still taken as the end
+    of the string.
+
+    Args:
+        jsonStr: JSON text, possibly malformed.
+
+    Returns:
+        The text with inner quotes escaped; empty or whitespace-only input
+        is returned unchanged.
+    """
+    if not jsonStr or not jsonStr.strip():
+        return jsonStr
+
+    length = len(jsonStr)
+    pieces = []
+    inString = False
+    afterBackslash = False
+
+    for pos, ch in enumerate(jsonStr):
+        if afterBackslash:
+            # Second half of an escape pair - copy it untouched
+            afterBackslash = False
+        elif ch == '\\' and inString:
+            afterBackslash = True
+        elif ch == '"':
+            if not inString:
+                inString = True
+            else:
+                # Look past whitespace to decide: closing quote or inner quote?
+                look = pos + 1
+                while look < length and jsonStr[look] in ' \t\n\r':
+                    look += 1
+                closesString = (
+                    look >= length
+                    or jsonStr[look] in ':,}]'
+                    or jsonStr[look] == '"'
+                )
+                if closesString:
+                    inString = False
+                else:
+                    pieces.append('\\')
+        pieces.append(ch)
+
+    return ''.join(pieces)
+
+
+def _fixUnescapedControlCharacters(jsonStr: str) -> str:
+    """
+    Escape raw control characters (code points below 32) inside JSON strings.
+
+    JSON does not allow such characters unescaped within a string. Newline,
+    carriage return, tab, backspace and form feed get their short forms
+    (\\n, \\r, \\t, \\b, \\f); every other one is written as \\uXXXX.
+    Characters outside of strings, and any character that directly follows
+    a backslash inside a string, are copied unchanged.
+
+    Args:
+        jsonStr: JSON text, possibly malformed.
+
+    Returns:
+        The text with control characters inside strings escaped; empty or
+        whitespace-only input is returned unchanged.
+    """
+    if not jsonStr or not jsonStr.strip():
+        return jsonStr
+
+    shortForms = {
+        '\n': '\\n',
+        '\r': '\\r',
+        '\t': '\\t',
+        '\b': '\\b',
+        '\f': '\\f',
+    }
+
+    pieces = []
+    insideString = False
+    afterBackslash = False
+
+    for ch in jsonStr:
+        if afterBackslash:
+            # Second half of an escape pair - copy it untouched
+            afterBackslash = False
+        elif insideString and ch == '\\':
+            afterBackslash = True
+        elif ch == '"':
+            # An unescaped quote toggles whether we are inside a string
+            insideString = not insideString
+        elif insideString and ord(ch) < 32:
+            ch = shortForms.get(ch, f'\\u{ord(ch):04x}')
+        pieces.append(ch)
+
+    return ''.join(pieces)
+
+
 def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
    """
    Attempt to repair broken JSON using multiple strategies.
@ -135,6 +289,11 @@ def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
    if not text:
        return None
    
+    # Pre-processing: Fix unescaped quotes and control characters inside strings
+    # AI often generates JSON like: "text with "quoted" words"
+    text = _fixUnescapedQuotesInStrings(text)
+    text = _fixUnescapedControlCharacters(text)
+    
    # Strategy 1: Structure closing - close incomplete structures WITHOUT truncating
    # This preserves all data and should be tried first
    closedStr = closeJsonStructures(text)
--- a/modules/workflows/methods/methodAi/actions/convertDocument.py
+++ b/modules/workflows/methods/methodAi/actions/convertDocument.py
@ -26,9 +26,16 @@ async def convertDocument(self, parameters: Dict[str, Any]) -> ActionResult:
        aiPrompt += " Preserve all document structure including headings, tables, formatting, lists, and layout."
    aiPrompt += " Ensure the converted document maintains the same content and information as the original."
    
-    return await self.process({
+    # Pass parentOperationId to maintain progress hierarchy
+    parentOperationId = parameters.get("parentOperationId")
+    
+    processParams = {
        "aiPrompt": aiPrompt,
        "documentList": documentList,
        "resultType": normalizedFormat
-    })
+    }
+    if parentOperationId:
+        processParams["parentOperationId"] = parentOperationId
+    
+    return await self.process(processParams)

--- a/modules/workflows/methods/methodAi/actions/summarizeDocument.py
+++ b/modules/workflows/methods/methodAi/actions/summarizeDocument.py
@ -28,10 +28,17 @@ async def summarizeDocument(self, parameters: Dict[str, Any]) -> ActionResult:
        aiPrompt += f" Focus specifically on: {focus}."
    aiPrompt += " Extract and present the key points, main ideas, and important information in a clear, well-structured format."
    
-    return await self.process({
+    # Pass parentOperationId to maintain progress hierarchy
+    parentOperationId = parameters.get("parentOperationId")
+    
+    processParams = {
        "aiPrompt": aiPrompt,
        "documentList": documentList,
        "resultType": resultType,
        "generationIntent": "document"  # NEW: Explicit intent
-    })
+    }
+    if parentOperationId:
+        processParams["parentOperationId"] = parentOperationId
+    
+    return await self.process(processParams)

--- a/modules/workflows/methods/methodAi/actions/translateDocument.py
+++ b/modules/workflows/methods/methodAi/actions/translateDocument.py
@ -29,6 +29,9 @@ async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
        aiPrompt += " Focus on accurate translation of content."
    aiPrompt += " Maintain the same document structure, headings, and organization."
    
+    # Pass parentOperationId to maintain progress hierarchy
+    parentOperationId = parameters.get("parentOperationId")
+    
    processParams = {
        "aiPrompt": aiPrompt,
        "documentList": documentList,
@ -36,6 +39,8 @@ async def translateDocument(self, parameters: Dict[str, Any]) -> ActionResult:
    }
    if resultType:
        processParams["resultType"] = resultType
+    if parentOperationId:
+        processParams["parentOperationId"] = parentOperationId
    
    return await self.process(processParams)

--- a/modules/workflows/processing/adaptive/contentValidator.py
+++ b/modules/workflows/processing/adaptive/contentValidator.py
@ -171,6 +171,7 @@ class ContentValidator:
                        # Include any additional fields from section (generic approach)
                        # This ensures all action-specific fields are preserved
                        # BUT exclude type-specific KPIs that don't belong to this content_type
+                        # AND exclude internal planning fields that confuse validation
                        contentType = section.get("content_type", "")
                        # Define KPIs that are ONLY valid for specific types
                        typeExclusiveKpis = {
@ -183,8 +184,12 @@ class ContentValidator:
                            if kpiType != contentType:
                                excludedKpis.extend(kpiFields)
                        
+                        # Internal planning fields that should NOT be shown to validation AI
+                        # These are implementation details, not content indicators
+                        internalFields = ["generationHint", "useAiCall", "elements"]
+                        
                        for key, value in section.items():
-                            if key not in sectionSummary and key not in ["elements"] and key not in excludedKpis:
+                            if key not in sectionSummary and key not in internalFields and key not in excludedKpis:
                                # Don't copy type-specific KPIs if they're 0/empty and we didn't extract them ourselves
                                # This prevents copying columnCount: 0, rowCount: 0, headers: [] from structure generation phase
                                if key in ["columnCount", "rowCount", "headers", "itemCount"]:
@ -309,6 +314,7 @@ class ContentValidator:
                        
                        # Include any additional fields from section (generic approach)
                        # BUT exclude type-specific KPIs that don't belong to this content_type
+                        # AND exclude internal planning fields that confuse validation
                        contentType = section.get("content_type", "")
                        # Define KPIs that are ONLY valid for specific types
                        typeExclusiveKpis = {
@ -321,8 +327,12 @@ class ContentValidator:
                            if kpiType != contentType:
                                excludedKpis.extend(kpiFields)
                        
+                        # Internal planning fields that should NOT be shown to validation AI
+                        # These are implementation details, not content indicators
+                        internalFields = ["generationHint", "useAiCall", "elements"]
+                        
                        for key, value in section.items():
-                            if key not in sectionSummary and key not in ["elements"] and key not in excludedKpis:
+                            if key not in sectionSummary and key not in internalFields and key not in excludedKpis:
                                # Don't copy type-specific KPIs if they're 0/empty and we didn't extract them ourselves
                                # This prevents copying columnCount: 0, rowCount: 0, headers: [] from structure generation phase
                                if key in ["columnCount", "rowCount", "headers", "itemCount"]: