proper context cascade from extraction to generation

2026-01-02 01:55:04 +01:00 · 2026-01-02 01:55:04 +01:00 · 53819f90c3
commit 53819f90c3
parent 64b44473aa
3 changed files with 218 additions and 76 deletions
--- a/modules/services/serviceAi/subStructureFilling.py
+++ b/modules/services/serviceAi/subStructureFilling.py
@ -596,6 +596,17 @@ class StructureFiller:
                            try:
                                extractionPrompt = part.metadata.get("extractionPrompt") or "Extract all text content from this image. Return only the extracted text, no additional formatting."
                                # Write debug file for image extraction prompt
                                if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
                                    try:
                                        partId = part.id[:8] if part.id else "unknown"
                                        partLabelSafe = (part.label or "image").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
                                        debugPrefix = f"extraction_image_{partId}_{partLabelSafe}"
                                        self.services.utils.writeDebugFile(extractionPrompt, f"{debugPrefix}_prompt")
                                        logger.debug(f"Wrote image extraction prompt debug file: {debugPrefix}_prompt")
                                    except Exception as debugError:
                                        logger.warning(f"Failed to write image extraction debug file: {str(debugError)}")
                                # Call Vision AI to extract text from image
                                visionRequest = AiCallRequest(
                                    prompt=extractionPrompt,
@ -606,6 +617,18 @@ class StructureFiller:
                                visionResponse = await self.aiService.callAi(visionRequest)
                                # Write debug file for image extraction response
                                if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
                                    try:
                                        partId = part.id[:8] if part.id else "unknown"
                                        partLabelSafe = (part.label or "image").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
                                        debugPrefix = f"extraction_image_{partId}_{partLabelSafe}"
                                        responseContent = visionResponse.content if visionResponse and visionResponse.content else ""
                                        self.services.utils.writeDebugFile(responseContent, f"{debugPrefix}_response")
                                        logger.debug(f"Wrote image extraction response debug file: {debugPrefix}_response")
                                    except Exception as debugError:
                                        logger.warning(f"Failed to write image extraction response debug file: {str(debugError)}")
                                if visionResponse and visionResponse.content:
                                    # Create text part with extracted content
                                    textPart = ContentPart(
@ -1573,7 +1596,7 @@ The JSON should be a fragment that can be merged with the previous response."""
        Flattening: Konvertiert Chapters zu finaler Section-Struktur.
        Jedes Chapter wird zu einer Heading-Section (Level 1) + dessen Sections.
-        IMPORTANT: Chapters are the main structure elements (heading level 1).
+        Chapters are the main structure elements (heading level 1).
        All section headings with level < 2 are adjusted to level 2.
        """
        result = {
@ -1674,7 +1697,7 @@ GENERATION HINT: {generationHint}
 NOTE: Chapter already has a heading section. Do NOT generate a heading for the chapter title.
-IMPORTANT - SECTION INDEPENDENCE:
+## SECTION INDEPENDENCE
 - Each section is independent and self-contained
 - One section does NOT have information about another section
 - Each section must provide its own context and be understandable alone
@ -1688,7 +1711,7 @@ useAiCall RULES:
 - useAiCall: true ONLY if ContentPart Format is "extracted" AND transformation needed
 - useAiCall: false if Format is "object" or "reference" (direct insertion)
 - useAiCall: false if Format is "extracted" AND simple "include full text" instruction
- useAiCall: true if NO ContentPartIds provided (content must be generated from scratch); Sections without ContentParts MUST have a clear, detailed generationHint explaining what content to generate
+- useAiCall: true if no ContentPartIds provided (content must be generated from scratch); Sections without ContentParts must have a clear, detailed generationHint explaining what content to generate
 RETURN JSON:
 {{
@ -1714,7 +1737,7 @@ EXAMPLES (all content types):
 - reference: {{"id": "s7", "content_type": "paragraph", "contentPartIds": ["ref_1"], "generationHint": "Reference", "useAiCall": false, "elements": []}}
 - NO CONTENT PARTS (generate from scratch): {{"id": "s8", "content_type": "paragraph", "contentPartIds": [], "generationHint": "Write a detailed professional paragraph explaining [specific topic or purpose]. Include [key points to cover]. Address [important aspects]. Conclude with [summary or recommendations].", "useAiCall": true, "elements": []}}
-CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
+Return only valid JSON. Do not include any explanatory text outside the JSON.
 """
        return prompt
@ -1752,19 +1775,9 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
        # Baue ContentParts-Beschreibung
        contentPartsText = ""
        if isAggregation:
-            # Aggregation: Zeige nur Metadaten, nicht Previews
+            # Aggregation: ContentParts werden als Parameter übergeben, keine IDs im Prompt nötig
-            contentPartsText += f"\n## CONTENT PARTS (Aggregation)\n"
+            # Keine ContentPart-Beschreibung nötig - Daten sind bereits im Context verfügbar
-            contentPartsText += f"- Anzahl: {len(validParts)} ContentParts\n"
+            contentPartsText = ""
            contentPartsText += f"- Alle ContentParts werden als Parameter übergeben (nicht im Prompt!)\n"
            contentPartsText += f"- Jeder Part kann sehr groß sein → Chunking automatisch\n"
            contentPartsText += f"- WICHTIG: Aggregiere ALLE Parts zu einem Element (z.B. eine Tabelle)\n\n"
            contentPartsText += f"ContentPart IDs:\n"
            for part in validParts:
                contentFormat = part.metadata.get("contentFormat", "unknown")
                contentPartsText += f"  - {part.id} (Format: {contentFormat}, Type: {part.typeGroup}"
                if part.metadata.get("originalFileName"):
                    contentPartsText += f", Source: {part.metadata.get('originalFileName')}"
                contentPartsText += ")\n"
        else:
            # Einzelverarbeitung: Zeige Previews
            for part in validParts:
@ -1838,6 +1851,9 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
        if isAggregation:
            prompt = f"""# TASK: Generate Section Content (Aggregation)
 Return only valid JSON. No explanatory text, no comments, no markdown formatting outside JSON.
 If ContentParts have no data, return: {{"elements": [{{"type": "{contentType}", "content": {{"headers": [], "rows": []}}}}]}}
 LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
 ## SECTION METADATA
@ -1845,22 +1861,14 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
 - Content Type: {contentType}
 - Generation Hint: {generationHint}
 ## AVAILABLE CONTENT FOR THIS SECTION
 {contentPartsText if contentPartsText else "(No content parts specified for this section)"}
 ## INSTRUCTIONS
-1. Generate content for section "{sectionId}" based on the generation hint above
+1. Extract all data from the context provided. Do not skip or omit any data.
-2. **AGGREGATION**: Combine ALL provided ContentParts into ONE element (e.g., one table with all data)
+2. Extract data only from the provided context. Never invent, create, or generate data that is not in the context.
-3. For table content_type: Create a single table with headers and rows from all ContentParts
+3. If the context contains no data, return empty structures (empty rows array for tables).
-4. For bullet_list content_type: Create a single list with items from all ContentParts
+4. Aggregate all data into one element (e.g., one table).
-5. Format appropriately based on content_type ({contentType})
+5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
-6. Ensure the generated content is self-contained and understandable independently
+6. Format based on content_type ({contentType}).
-7. Return ONLY a JSON object with an "elements" array
+7. No HTML/styling: Plain text only, no markup.
 8. Each element should match the content_type: {contentType}
 9. CRITICAL - NO HTML/STYLING: Do NOT include HTML tags, CSS styles, or any formatting markup in text content. Return plain text only. Formatting is handled automatically by the renderer.
 10. For paragraphs: Return plain text only, no HTML tags like <div>, <span>, <p>, or style attributes
 11. For headings: Return plain text only, no HTML tags or styling
 12. For images: Do NOT include base64 data in JSON - images are handled separately
 ## OUTPUT FORMAT
 Return a JSON object with this structure:
@ -1874,16 +1882,19 @@ Return a JSON object with this structure:
  ]
 }}
-CRITICAL: 
+Output requirements:
- "content" MUST always be an object (never a string)
+- "content" must be an object (never a string)
- For text content: Return plain text only, NO HTML tags, NO CSS styles, NO formatting markup
+- Return only valid JSON - no text before, no text after, no comments, no explanations
- Return ONLY valid JSON. Do not include any explanatory text outside the JSON.
+- No invented data: Return empty structures if ContentParts have no data
 - Extract all data: Process every ContentPart completely and include all extracted data
-## CONTEXT (for reference only)
+## USER REQUEST (for context)
 {contextText if contextText else ""}
 ```
 {userPrompt}
 ```
 ## CONTEXT
 {contextText if contextText else ""}
 """
        else:
            prompt = f"""# TASK: Generate Section Content
@ -1899,16 +1910,11 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
 {contentPartsText if contentPartsText else "(No content parts specified for this section)"}
 ## INSTRUCTIONS
-1. Generate content for section "{sectionId}" based on the generation hint above
+1. Extract data only from provided ContentParts. Never invent or generate data.
-2. Use the available content parts to populate this section
+2. If ContentParts contain no data, return empty structures (empty rows array for tables).
-3. For extracted text: Format appropriately based on content_type ({contentType})
+3. Format based on content_type ({contentType}).
-4. Ensure the generated content is self-contained and understandable independently
+4. Return only valid JSON with "elements" array.
-5. Return ONLY a JSON object with an "elements" array
+5. No HTML/styling: Plain text only, no markup.
 6. Each element should match the content_type: {contentType}
 7. CRITICAL - NO HTML/STYLING: Do NOT include HTML tags, CSS styles, or any formatting markup in text content. Return plain text only. Formatting is handled automatically by the renderer.
 8. For paragraphs: Return plain text only, no HTML tags like <div>, <span>, <p>, or style attributes
 9. For headings: Return plain text only, no HTML tags or styling
 10. For images: If you need to reference an image, describe it in altText. Do NOT include base64 data - images are handled separately
 ## OUTPUT FORMAT
 Return a JSON object with this structure:
@ -1922,16 +1928,18 @@ Return a JSON object with this structure:
  ]
 }}
-CRITICAL: 
+Output requirements:
- "content" MUST always be an object (never a string)
+- "content" must be an object (never a string)
- For text content: Return plain text only, NO HTML tags, NO CSS styles, NO formatting markup
+- Return only valid JSON, no explanatory text
- Return ONLY valid JSON. Do not include any explanatory text outside the JSON
+- No invented data: Return empty structures if ContentParts have no data
-## CONTEXT (for reference only)
+## USER REQUEST
 {contextText if contextText else ""}
 ```
 {userPrompt}
 ```
 ## CONTEXT
 {contextText if contextText else ""}
 """
        return prompt
--- a/modules/services/serviceAi/subStructureGeneration.py
+++ b/modules/services/serviceAi/subStructureGeneration.py
@ -250,44 +250,45 @@ Continue generating the remaining chapters now.
        language = self._getUserLanguage()
        logger.debug(f"Using language from services (user intention analysis) for structure generation: {language}")
-        prompt = f"""CRITICAL OUTPUT REQUIREMENT: This is a PLANNING task, not a generation task. You MUST return EXACTLY ONE complete JSON object. Do NOT generate multiple JSON objects, alternatives, or variations. Do NOT use separators like "---" between JSON objects. Return the single best JSON structure that matches the requirements below.
+        prompt = f"""# TASK: Generate Chapter Structure
-USER REQUEST (for context):
+This is a PLANNING task. Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects.
 ## USER REQUEST (for context)
 ```
 {userPrompt}
 ```
 LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
-AVAILABLE CONTENT PARTS:
+## AVAILABLE CONTENT PARTS
 {contentPartsIndex}
-TASK: Generate Chapter Structure for the documents to be generated.
+## CHAPTER INDEPENDENCE
 IMPORTANT - CHAPTER INDEPENDENCE:
 - Each chapter is independent and self-contained
 - One chapter does NOT have information about another chapter
 - Each chapter must provide its own context and be understandable alone
-CONTENT ASSIGNMENT:
+## CONTENT ASSIGNMENT
 - Assign ContentParts to chapters via contentParts object
- For data extraction, the type of a contentPart (image, text, etc.) is NOT relevant - only what is specified in the instruction matters
+- For data extraction, the type of a contentPart (image, text, etc.) is not relevant - only what is specified in the instruction matters
- Include ALL relevant parts from same source when needed for structured data extraction
+- Include all relevant parts from same source when needed for structured data extraction
 - Each contentPart can have either:
  - "instruction": For AI extraction prompts (how to process/extract from this part)
  - "caption": For user-facing presentation (how to display/reference this part in the document)
  - Both can be present if needed
 - Chapters without contentParts can only generate generic content (not document-specific)
-FORMATTING:
+## FORMATTING
- Formatting is handled automatically - focus on CONTENT and STRUCTURE only
+- Formatting is handled automatically - focus on content and structure only
-CHAPTER STRUCTURE:
+## CHAPTER STRUCTURE
 - chapter id, level (1, 2, 3, etc.), title
 - contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Compact mapping of part IDs to their extraction instructions and/or presentation captions
- generationHint: Self-contained description (if contentParts is empty, must be VERY DETAILED)
+- generationHint: Self-contained description that reflects the user's intent for the specific data. If contentParts is empty, must be detailed. If contentParts are present, the hint should guide how to extract and structure the data according to the user's requirements (e.g., specific columns, format, structure)
-RETURN JSON:
+## OUTPUT FORMAT
 Return JSON:
 {{
  "metadata": {{
    "title": "Document Title",
--- a/modules/services/serviceExtraction/mainServiceExtraction.py
+++ b/modules/services/serviceExtraction/mainServiceExtraction.py
@ -652,6 +652,18 @@ class ExtractionService:
                mergeType="concatenate"
            )
        # Check if this is an elements response format (elements array structure)
        # This is used for section content generation where multiple ContentParts are processed
        isElementsResponse = self._isElementsResponse(content_parts)
        if isElementsResponse:
            # Merge JSON elements responses intelligently (merge tables, combine elements)
            logger.info(f"Detected 'elements' JSON response format - merging {len(content_parts)} JSON responses")
            merged_json = self._mergeElementsResponses(content_parts)
            merged_json_str = json.dumps(merged_json, indent=2, ensure_ascii=False)
            logger.info(f"Successfully merged 'elements' JSON responses into single unified JSON ({len(merged_json_str)} chars)")
            return merged_json_str
        # Check if this is a JSON extraction response format (extracted_content structure)
        # If so, merge JSON structures intelligently before applying regular merging
        isJsonExtractionResponse = self._isJsonExtractionResponse(content_parts)
@ -736,11 +748,112 @@ class ExtractionService:
        return False
    def _isElementsResponse(self, content_parts: List[ContentPart]) -> bool:
        """Check whether the first content part holds a JSON object with an 'elements' list.

        Only ``content_parts[0]`` is inspected; the remaining parts are not
        examined by this check.

        Args:
            content_parts: Parts whose ``data`` may contain JSON text. The text is
                passed through ``stripCodeFences`` before parsing.

        Returns:
            True if the first part's data is a string that parses to a dict whose
            "elements" value is a list; False in every other case (no parts,
            non-string data, non-JSON text, JSON of a different shape).
        """
        if not content_parts:
            return False

        firstPartData = content_parts[0].data or ""
        if not isinstance(firstPartData, str):
            return False

        strippedData = stripCodeFences(firstPartData.strip())
        # Only a JSON object can carry an "elements" key, so anything that does
        # not start with '{' (including a top-level array) is rejected up front.
        if not strippedData.startswith('{'):
            return False

        try:
            parsed = json.loads(strippedData)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested input. Either way this is not a usable response.
            return False

        return isinstance(parsed, dict) and isinstance(parsed.get("elements"), list)
    def _mergeElementsResponses(self, content_parts: List[ContentPart]) -> Dict[str, Any]:
        """Merge several JSON 'elements' responses into one ``{"elements": [...]}`` dict.

        Merge rules:
        - Non-table elements are kept in encounter order; an element equal to one
          already collected is not added again.
        - Table elements are grouped by their headers. Rows of all tables sharing
          the same headers are concatenated (duplicate rows are preserved).
        - Tables without any rows never appear in the output.
        - Tables that have rows but no headers are merged under an empty header
          list instead of being dropped, so no extracted rows are lost.
        - Merged tables are appended after all non-table elements, ordered by the
          first occurrence of their headers.

        Args:
            content_parts: Parts whose ``data`` holds JSON text, optionally wrapped
                in code fences and optionally containing several JSON blocks
                separated by ``---``. Parts with empty or non-string data are skipped.

        Returns:
            A dict with a single "elements" key holding the merged element list.
        """
        merged_elements: List[Any] = []
        # headers tuple -> accumulated rows. Dict insertion order determines the
        # order in which merged tables are emitted.
        rows_by_headers: Dict[tuple, List[Any]] = {}

        for part in content_parts:
            if not part.data:
                continue
            if not isinstance(part.data, str):
                # Only text can be split/parsed; skip instead of aborting the whole merge.
                logger.warning(f"Skipping part {part.id} in elements merge: data is not a string")
                continue

            for parsed in self._parseElementsPayloads(part):
                for element in parsed["elements"]:
                    isTable = (
                        isinstance(element, dict)
                        and element.get("type") == "table"
                        and "content" in element
                    )
                    if not isTable:
                        # Keep non-table elements as is, but avoid exact duplicates.
                        if element not in merged_elements:
                            merged_elements.append(element)
                        continue

                    try:
                        table_content = element["content"]
                        headers = table_content.get("headers", [])
                        rows = table_content.get("rows", [])
                        if not headers and not rows:
                            # Completely empty table: nothing to merge.
                            continue
                        headers_key = tuple(headers or ())
                        # Register the headers even for an empty table so that the
                        # output order follows the first occurrence of the headers.
                        bucket = rows_by_headers.setdefault(headers_key, [])
                        if rows:
                            bucket.extend(rows)
                    except (AttributeError, TypeError) as e:
                        # Malformed table (non-dict content, unhashable headers,
                        # non-iterable rows): skip this element only.
                        logger.warning(f"Skipping malformed table element from part {part.id}: {str(e)}")

        # Emit one table per distinct header set; header sets that only ever
        # appeared on empty tables are skipped.
        for headers_key, all_rows in rows_by_headers.items():
            if all_rows:
                merged_elements.append({
                    "type": "table",
                    "content": {
                        "headers": list(headers_key),
                        "rows": all_rows
                    }
                })

        return {"elements": merged_elements}

    def _parseElementsPayloads(self, part: ContentPart) -> List[Dict[str, Any]]:
        """Parse one part's text into the list of dicts that carry an 'elements' list.

        The text is first treated as one or more JSON blocks separated by ``---``.
        If any block fails to parse, the unsplit text is tried as a single JSON
        document, because ``---`` may legitimately occur inside a JSON string value.

        Args:
            part: Content part whose ``data`` is a non-empty string.

        Returns:
            Parsed dicts (in source order) whose "elements" value is a list.
            Blocks that cannot be parsed are logged and omitted.
        """
        rawData = part.data
        payloads: List[Any] = []
        failures: List[Exception] = []

        for blockData in rawData.split('---'):
            candidate = blockData.strip()
            if not candidate:
                continue
            try:
                strippedData = stripCodeFences(candidate)
                if not strippedData:
                    continue
                payloads.append(json.loads(strippedData))
            except Exception as e:
                failures.append(e)

        if failures and '---' in rawData:
            # The separator may have been part of the content rather than a block
            # delimiter; retry with the complete text before giving up on it.
            try:
                whole = json.loads(stripCodeFences(rawData.strip()))
            except Exception:
                whole = None
            if isinstance(whole, dict) and isinstance(whole.get("elements"), list):
                payloads = [whole]
                failures = []

        for e in failures:
            logger.warning(f"Failed to parse JSON elements response from part {part.id}: {str(e)}")

        return [
            payload for payload in payloads
            if isinstance(payload, dict) and isinstance(payload.get("elements"), list)
        ]
    def _mergeJsonExtractionResponses(self, content_parts: List[ContentPart], originalContentParts: Optional[List[ContentPart]] = None) -> Dict[str, Any]:
        """Merge multiple JSON extraction responses into one unified response.
        Merges:
-        - Tables: Combines all table rows, preserves headers, removes duplicates
+        - Tables: Combines all table rows, preserves headers (duplicates preserved)
        - Text: Combines all text blocks
        - Headings: Combines all headings arrays
        - Lists: Combines all list items
@ -927,16 +1040,10 @@ class ExtractionService:
        for headers_key, tables in table_headers_map.items():
            # Collect all rows from tables with same headers
            all_rows = []
            seen_rows = set()  # Track duplicates using tuple representation
            for table in tables:
                rows = table.get("rows", [])
-                for row in rows:
+                all_rows.extend(rows)
                    # Convert row to tuple for duplicate detection
                    row_tuple = tuple(str(cell) for cell in row)
                    if row_tuple not in seen_rows:
                        seen_rows.add(row_tuple)
                        all_rows.append(row)
            # Create merged table
            if all_rows:
@ -1450,6 +1557,32 @@ class ExtractionService:
                        contentPart, prompt, options, failoverModelList, aiObjects, None  # Don't pass progressCallback to avoid double logging
                    )
                    # Write debug files for generation phase (section content generation)
                    # Check for DATA_GENERATE or DATA_ANALYSE (used for section generation)
                    isGenerationPhase = False
                    if options and hasattr(options, 'operationType'):
                        isGenerationPhase = (options.operationType == OperationTypeEnum.DATA_GENERATE or 
                                            options.operationType == OperationTypeEnum.DATA_ANALYSE)
                    if isGenerationPhase:
                        if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
                            try:
                                # Create debug filename with contentPart ID or label
                                partId = contentPart.id[:8] if contentPart.id else f"part_{partIndex+1}"
                                partLabelSafe = (contentPart.label or f"part_{partIndex+1}").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
                                debugPrefix = f"generation_contentPart_{partId}_{partLabelSafe}"
                                # Write prompt
                                self.services.utils.writeDebugFile(prompt, f"{debugPrefix}_prompt")
                                # Write response
                                responseContent = partResult.content if partResult.content else ""
                                self.services.utils.writeDebugFile(responseContent, f"{debugPrefix}_response")
                                logger.debug(f"Wrote debug files for contentPart {partId} (generation): {debugPrefix}_prompt, {debugPrefix}_response")
                            except Exception as debugError:
                                logger.warning(f"Failed to write debug file for contentPart {contentPart.id}: {str(debugError)}")
                    # Update completed count and log progress
                    completedCount[0] += 1
                    if progressCallback: