proper context cascade from extraction to generation

This commit is contained in:
ValueOn AG 2026-01-02 01:55:04 +01:00
parent 64b44473aa
commit 53819f90c3
3 changed files with 218 additions and 76 deletions

View file

@ -596,6 +596,17 @@ class StructureFiller:
try: try:
extractionPrompt = part.metadata.get("extractionPrompt") or "Extract all text content from this image. Return only the extracted text, no additional formatting." extractionPrompt = part.metadata.get("extractionPrompt") or "Extract all text content from this image. Return only the extracted text, no additional formatting."
# Write debug file for image extraction prompt
if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
try:
partId = part.id[:8] if part.id else "unknown"
partLabelSafe = (part.label or "image").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
debugPrefix = f"extraction_image_{partId}_{partLabelSafe}"
self.services.utils.writeDebugFile(extractionPrompt, f"{debugPrefix}_prompt")
logger.debug(f"Wrote image extraction prompt debug file: {debugPrefix}_prompt")
except Exception as debugError:
logger.warning(f"Failed to write image extraction debug file: {str(debugError)}")
# Call Vision AI to extract text from image # Call Vision AI to extract text from image
visionRequest = AiCallRequest( visionRequest = AiCallRequest(
prompt=extractionPrompt, prompt=extractionPrompt,
@ -606,6 +617,18 @@ class StructureFiller:
visionResponse = await self.aiService.callAi(visionRequest) visionResponse = await self.aiService.callAi(visionRequest)
# Write debug file for image extraction response
if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
try:
partId = part.id[:8] if part.id else "unknown"
partLabelSafe = (part.label or "image").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
debugPrefix = f"extraction_image_{partId}_{partLabelSafe}"
responseContent = visionResponse.content if visionResponse and visionResponse.content else ""
self.services.utils.writeDebugFile(responseContent, f"{debugPrefix}_response")
logger.debug(f"Wrote image extraction response debug file: {debugPrefix}_response")
except Exception as debugError:
logger.warning(f"Failed to write image extraction response debug file: {str(debugError)}")
if visionResponse and visionResponse.content: if visionResponse and visionResponse.content:
# Create text part with extracted content # Create text part with extracted content
textPart = ContentPart( textPart = ContentPart(
@ -1573,7 +1596,7 @@ The JSON should be a fragment that can be merged with the previous response."""
Flattening: Konvertiert Chapters zu finaler Section-Struktur. Flattening: Konvertiert Chapters zu finaler Section-Struktur.
Jedes Chapter wird zu einer Heading-Section (Level 1) + dessen Sections. Jedes Chapter wird zu einer Heading-Section (Level 1) + dessen Sections.
IMPORTANT: Chapters are the main structure elements (heading level 1). Chapters are the main structure elements (heading level 1).
All section headings with level < 2 are adjusted to level 2. All section headings with level < 2 are adjusted to level 2.
""" """
result = { result = {
@ -1674,7 +1697,7 @@ GENERATION HINT: {generationHint}
NOTE: Chapter already has a heading section. Do NOT generate a heading for the chapter title. NOTE: Chapter already has a heading section. Do NOT generate a heading for the chapter title.
IMPORTANT - SECTION INDEPENDENCE: ## SECTION INDEPENDENCE
- Each section is independent and self-contained - Each section is independent and self-contained
- One section does NOT have information about another section - One section does NOT have information about another section
- Each section must provide its own context and be understandable alone - Each section must provide its own context and be understandable alone
@ -1688,7 +1711,7 @@ useAiCall RULES:
- useAiCall: true ONLY if ContentPart Format is "extracted" AND transformation needed - useAiCall: true ONLY if ContentPart Format is "extracted" AND transformation needed
- useAiCall: false if Format is "object" or "reference" (direct insertion) - useAiCall: false if Format is "object" or "reference" (direct insertion)
- useAiCall: false if Format is "extracted" AND simple "include full text" instruction - useAiCall: false if Format is "extracted" AND simple "include full text" instruction
- useAiCall: true if NO ContentPartIds provided (content must be generated from scratch); Sections without ContentParts MUST have a clear, detailed generationHint explaining what content to generate - useAiCall: true if no ContentPartIds provided (content must be generated from scratch); Sections without ContentParts must have a clear, detailed generationHint explaining what content to generate
RETURN JSON: RETURN JSON:
{{ {{
@ -1714,7 +1737,7 @@ EXAMPLES (all content types):
- reference: {{"id": "s7", "content_type": "paragraph", "contentPartIds": ["ref_1"], "generationHint": "Reference", "useAiCall": false, "elements": []}} - reference: {{"id": "s7", "content_type": "paragraph", "contentPartIds": ["ref_1"], "generationHint": "Reference", "useAiCall": false, "elements": []}}
- NO CONTENT PARTS (generate from scratch): {{"id": "s8", "content_type": "paragraph", "contentPartIds": [], "generationHint": "Write a detailed professional paragraph explaining [specific topic or purpose]. Include [key points to cover]. Address [important aspects]. Conclude with [summary or recommendations].", "useAiCall": true, "elements": []}} - NO CONTENT PARTS (generate from scratch): {{"id": "s8", "content_type": "paragraph", "contentPartIds": [], "generationHint": "Write a detailed professional paragraph explaining [specific topic or purpose]. Include [key points to cover]. Address [important aspects]. Conclude with [summary or recommendations].", "useAiCall": true, "elements": []}}
CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside the JSON. Return only valid JSON. Do not include any explanatory text outside the JSON.
""" """
return prompt return prompt
@ -1752,19 +1775,9 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
# Baue ContentParts-Beschreibung # Baue ContentParts-Beschreibung
contentPartsText = "" contentPartsText = ""
if isAggregation: if isAggregation:
# Aggregation: Zeige nur Metadaten, nicht Previews # Aggregation: ContentParts werden als Parameter übergeben, keine IDs im Prompt nötig
contentPartsText += f"\n## CONTENT PARTS (Aggregation)\n" # Keine ContentPart-Beschreibung nötig - Daten sind bereits im Context verfügbar
contentPartsText += f"- Anzahl: {len(validParts)} ContentParts\n" contentPartsText = ""
contentPartsText += f"- Alle ContentParts werden als Parameter übergeben (nicht im Prompt!)\n"
contentPartsText += f"- Jeder Part kann sehr groß sein → Chunking automatisch\n"
contentPartsText += f"- WICHTIG: Aggregiere ALLE Parts zu einem Element (z.B. eine Tabelle)\n\n"
contentPartsText += f"ContentPart IDs:\n"
for part in validParts:
contentFormat = part.metadata.get("contentFormat", "unknown")
contentPartsText += f" - {part.id} (Format: {contentFormat}, Type: {part.typeGroup}"
if part.metadata.get("originalFileName"):
contentPartsText += f", Source: {part.metadata.get('originalFileName')}"
contentPartsText += ")\n"
else: else:
# Einzelverarbeitung: Zeige Previews # Einzelverarbeitung: Zeige Previews
for part in validParts: for part in validParts:
@ -1838,6 +1851,9 @@ CRITICAL: Return ONLY valid JSON. Do not include any explanatory text outside th
if isAggregation: if isAggregation:
prompt = f"""# TASK: Generate Section Content (Aggregation) prompt = f"""# TASK: Generate Section Content (Aggregation)
Return only valid JSON. No explanatory text, no comments, no markdown formatting outside JSON.
If ContentParts have no data, return: {{"elements": [{{"type": "{contentType}", "content": {{"headers": [], "rows": []}}}}]}}
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}. LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
## SECTION METADATA ## SECTION METADATA
@ -1845,22 +1861,14 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
- Content Type: {contentType} - Content Type: {contentType}
- Generation Hint: {generationHint} - Generation Hint: {generationHint}
## AVAILABLE CONTENT FOR THIS SECTION
{contentPartsText if contentPartsText else "(No content parts specified for this section)"}
## INSTRUCTIONS ## INSTRUCTIONS
1. Generate content for section "{sectionId}" based on the generation hint above 1. Extract all data from the context provided. Do not skip or omit any data.
2. **AGGREGATION**: Combine ALL provided ContentParts into ONE element (e.g., one table with all data) 2. Extract data only from the provided context. Never invent, create, or generate data that is not in the context.
3. For table content_type: Create a single table with headers and rows from all ContentParts 3. If the context contains no data, return empty structures (empty rows array for tables).
4. For bullet_list content_type: Create a single list with items from all ContentParts 4. Aggregate all data into one element (e.g., one table).
5. Format appropriately based on content_type ({contentType}) 5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists.
6. Ensure the generated content is self-contained and understandable independently 6. Format based on content_type ({contentType}).
7. Return ONLY a JSON object with an "elements" array 7. No HTML/styling: Plain text only, no markup.
8. Each element should match the content_type: {contentType}
9. CRITICAL - NO HTML/STYLING: Do NOT include HTML tags, CSS styles, or any formatting markup in text content. Return plain text only. Formatting is handled automatically by the renderer.
10. For paragraphs: Return plain text only, no HTML tags like <div>, <span>, <p>, or style attributes
11. For headings: Return plain text only, no HTML tags or styling
12. For images: Do NOT include base64 data in JSON - images are handled separately
## OUTPUT FORMAT ## OUTPUT FORMAT
Return a JSON object with this structure: Return a JSON object with this structure:
@ -1874,16 +1882,19 @@ Return a JSON object with this structure:
] ]
}} }}
CRITICAL: Output requirements:
- "content" MUST always be an object (never a string) - "content" must be an object (never a string)
- For text content: Return plain text only, NO HTML tags, NO CSS styles, NO formatting markup - Return only valid JSON - no text before, no text after, no comments, no explanations
- Return ONLY valid JSON. Do not include any explanatory text outside the JSON. - No invented data: Return empty structures if ContentParts have no data
- Extract all data: Process every ContentPart completely and include all extracted data
## CONTEXT (for reference only) ## USER REQUEST (for context)
{contextText if contextText else ""}
``` ```
{userPrompt} {userPrompt}
``` ```
## CONTEXT
{contextText if contextText else ""}
""" """
else: else:
prompt = f"""# TASK: Generate Section Content prompt = f"""# TASK: Generate Section Content
@ -1899,16 +1910,11 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles,
{contentPartsText if contentPartsText else "(No content parts specified for this section)"} {contentPartsText if contentPartsText else "(No content parts specified for this section)"}
## INSTRUCTIONS ## INSTRUCTIONS
1. Generate content for section "{sectionId}" based on the generation hint above 1. Extract data only from provided ContentParts. Never invent or generate data.
2. Use the available content parts to populate this section 2. If ContentParts contain no data, return empty structures (empty rows array for tables).
3. For extracted text: Format appropriately based on content_type ({contentType}) 3. Format based on content_type ({contentType}).
4. Ensure the generated content is self-contained and understandable independently 4. Return only valid JSON with "elements" array.
5. Return ONLY a JSON object with an "elements" array 5. No HTML/styling: Plain text only, no markup.
6. Each element should match the content_type: {contentType}
7. CRITICAL - NO HTML/STYLING: Do NOT include HTML tags, CSS styles, or any formatting markup in text content. Return plain text only. Formatting is handled automatically by the renderer.
8. For paragraphs: Return plain text only, no HTML tags like <div>, <span>, <p>, or style attributes
9. For headings: Return plain text only, no HTML tags or styling
10. For images: If you need to reference an image, describe it in altText. Do NOT include base64 data - images are handled separately
## OUTPUT FORMAT ## OUTPUT FORMAT
Return a JSON object with this structure: Return a JSON object with this structure:
@ -1922,16 +1928,18 @@ Return a JSON object with this structure:
] ]
}} }}
CRITICAL: Output requirements:
- "content" MUST always be an object (never a string) - "content" must be an object (never a string)
- For text content: Return plain text only, NO HTML tags, NO CSS styles, NO formatting markup - Return only valid JSON, no explanatory text
- Return ONLY valid JSON. Do not include any explanatory text outside the JSON - No invented data: Return empty structures if ContentParts have no data
## CONTEXT (for reference only) ## USER REQUEST
{contextText if contextText else ""}
``` ```
{userPrompt} {userPrompt}
``` ```
## CONTEXT
{contextText if contextText else ""}
""" """
return prompt return prompt

View file

@ -250,44 +250,45 @@ Continue generating the remaining chapters now.
language = self._getUserLanguage() language = self._getUserLanguage()
logger.debug(f"Using language from services (user intention analysis) for structure generation: {language}") logger.debug(f"Using language from services (user intention analysis) for structure generation: {language}")
prompt = f"""CRITICAL OUTPUT REQUIREMENT: This is a PLANNING task, not a generation task. You MUST return EXACTLY ONE complete JSON object. Do NOT generate multiple JSON objects, alternatives, or variations. Do NOT use separators like "---" between JSON objects. Return the single best JSON structure that matches the requirements below. prompt = f"""# TASK: Generate Chapter Structure
USER REQUEST (for context): This is a PLANNING task. Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects.
## USER REQUEST (for context)
``` ```
{userPrompt} {userPrompt}
``` ```
LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}. LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}.
AVAILABLE CONTENT PARTS: ## AVAILABLE CONTENT PARTS
{contentPartsIndex} {contentPartsIndex}
TASK: Generate Chapter Structure for the documents to be generated. ## CHAPTER INDEPENDENCE
IMPORTANT - CHAPTER INDEPENDENCE:
- Each chapter is independent and self-contained - Each chapter is independent and self-contained
- One chapter does NOT have information about another chapter - One chapter does NOT have information about another chapter
- Each chapter must provide its own context and be understandable alone - Each chapter must provide its own context and be understandable alone
CONTENT ASSIGNMENT: ## CONTENT ASSIGNMENT
- Assign ContentParts to chapters via contentParts object - Assign ContentParts to chapters via contentParts object
- For data extraction, the type of a contentPart (image, text, etc.) is NOT relevant - only what is specified in the instruction matters - For data extraction, the type of a contentPart (image, text, etc.) is not relevant - only what is specified in the instruction matters
- Include ALL relevant parts from same source when needed for structured data extraction - Include all relevant parts from same source when needed for structured data extraction
- Each contentPart can have either: - Each contentPart can have either:
- "instruction": For AI extraction prompts (how to process/extract from this part) - "instruction": For AI extraction prompts (how to process/extract from this part)
- "caption": For user-facing presentation (how to display/reference this part in the document) - "caption": For user-facing presentation (how to display/reference this part in the document)
- Both can be present if needed - Both can be present if needed
- Chapters without contentParts can only generate generic content (not document-specific) - Chapters without contentParts can only generate generic content (not document-specific)
FORMATTING: ## FORMATTING
- Formatting is handled automatically - focus on CONTENT and STRUCTURE only - Formatting is handled automatically - focus on content and structure only
CHAPTER STRUCTURE: ## CHAPTER STRUCTURE
- chapter id, level (1, 2, 3, etc.), title - chapter id, level (1, 2, 3, etc.), title
- contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Compact mapping of part IDs to their extraction instructions and/or presentation captions - contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Compact mapping of part IDs to their extraction instructions and/or presentation captions
- generationHint: Self-contained description (if contentParts is empty, must be VERY DETAILED) - generationHint: Self-contained description that reflects the user's intent for the specific data. If contentParts is empty, must be detailed. If contentParts are present, the hint should guide how to extract and structure the data according to the user's requirements (e.g., specific columns, format, structure)
RETURN JSON: ## OUTPUT FORMAT
Return JSON:
{{ {{
"metadata": {{ "metadata": {{
"title": "Document Title", "title": "Document Title",

View file

@ -652,6 +652,18 @@ class ExtractionService:
mergeType="concatenate" mergeType="concatenate"
) )
# Check if this is an elements response format (elements array structure)
# This is used for section content generation where multiple ContentParts are processed
isElementsResponse = self._isElementsResponse(content_parts)
if isElementsResponse:
# Merge JSON elements responses intelligently (merge tables, combine elements)
logger.info(f"Detected 'elements' JSON response format - merging {len(content_parts)} JSON responses")
merged_json = self._mergeElementsResponses(content_parts)
merged_json_str = json.dumps(merged_json, indent=2, ensure_ascii=False)
logger.info(f"Successfully merged 'elements' JSON responses into single unified JSON ({len(merged_json_str)} chars)")
return merged_json_str
# Check if this is a JSON extraction response format (extracted_content structure) # Check if this is a JSON extraction response format (extracted_content structure)
# If so, merge JSON structures intelligently before applying regular merging # If so, merge JSON structures intelligently before applying regular merging
isJsonExtractionResponse = self._isJsonExtractionResponse(content_parts) isJsonExtractionResponse = self._isJsonExtractionResponse(content_parts)
@ -736,11 +748,112 @@ class ExtractionService:
return False return False
def _isElementsResponse(self, content_parts: List[ContentPart]) -> bool:
    """Check whether the first content part is a JSON object with an 'elements' list.

    Only the first part is inspected. Its data must be a string that, after
    whitespace stripping and ``stripCodeFences``, parses as a single JSON
    document whose top level is an object containing an ``"elements"`` key
    holding a list.

    Args:
        content_parts: Parts to inspect; may be empty.

    Returns:
        True if the first part matches the 'elements' response shape,
        otherwise False (including when the data is empty, not a string,
        or not valid JSON).
    """
    if not content_parts:
        return False

    firstPartData = content_parts[0].data if content_parts[0].data else ""
    if not isinstance(firstPartData, str):
        return False

    strippedData = stripCodeFences(firstPartData.strip())
    # Cheap pre-filter: skip the JSON parser for text that cannot be a JSON container.
    if not strippedData.startswith(('{', '[')):
        return False

    # NOTE(review): the whole first part must parse as ONE JSON document, so a
    # first part made of several '---'-separated JSON blocks is reported as
    # "not an elements response" - confirm that this is intended.
    try:
        parsed = json.loads(strippedData)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers pathologically
        # nested input. Anything else (e.g. KeyboardInterrupt) must propagate
        # instead of being swallowed by a bare except.
        return False

    return isinstance(parsed, dict) and isinstance(parsed.get("elements"), list)
def _mergeElementsResponses(self, content_parts: List[ContentPart]) -> Dict[str, Any]:
    """Merge several JSON 'elements' responses into a single response.

    Each part's data is split on '---', every non-empty block is passed through
    ``stripCodeFences`` and parsed as JSON, and the entries of its ``"elements"``
    list are collected:

    - Tables with headers are grouped by their exact header sequence; the rows
      of all tables in a group are concatenated in encounter order (duplicate
      rows are kept). A group that ends up without any rows is omitted.
    - Tables with rows but without headers cannot be grouped and are kept
      unchanged, so their data is not lost.
    - Tables with neither headers nor rows are dropped.
    - All other elements are kept as-is; an element equal to one that was
      already collected is not added again.

    Merged tables are appended after the other collected elements. Blocks that
    cannot be parsed are logged and skipped; they never abort the merge.

    Args:
        content_parts: Parts whose ``data`` holds the JSON responses.

    Returns:
        A dict of the form ``{"elements": [...]}``.
    """
    merged_elements: List[Dict[str, Any]] = []
    # header sequence (as tuple) -> contents of all non-empty tables using these headers
    table_headers_map: Dict[tuple, List[Dict[str, Any]]] = {}

    for part in content_parts:
        if not part.data:
            continue
        if not isinstance(part.data, str):
            # Only text payloads can be split and parsed; skipping keeps one odd
            # part from aborting the merge of all the others.
            logger.warning(f"Skipping part {part.id} in elements merge: data is not a string")
            continue

        # Handle multiple JSON blocks in a single response (separated by ---).
        # NOTE(review): this also splits on a '---' that occurs inside a JSON
        # string value; such a block then fails to parse and is skipped below.
        for blockData in part.data.split('---'):
            if not blockData.strip():
                continue

            try:
                strippedData = stripCodeFences(blockData.strip())
                if not strippedData:
                    continue

                parsed = json.loads(strippedData)
                if not (isinstance(parsed, dict) and isinstance(parsed.get("elements"), list)):
                    continue

                for element in parsed["elements"]:
                    isTable = (
                        isinstance(element, dict)
                        and element.get("type") == "table"
                        and "content" in element
                    )
                    if not isTable:
                        # Keep non-table elements as is, but avoid duplicates
                        if element not in merged_elements:
                            merged_elements.append(element)
                        continue

                    table_content = element["content"]
                    if not isinstance(table_content, dict):
                        # Malformed table: drop only this element rather than
                        # losing every remaining element of the block.
                        logger.warning(f"Skipping table with non-object content in part {part.id}")
                        continue

                    headers = table_content.get("headers", [])
                    rows = table_content.get("rows", [])

                    if headers:
                        # Register the header group even for an empty table; only
                        # tables that actually carry rows are queued for merging.
                        tables = table_headers_map.setdefault(tuple(headers), [])
                        if rows:
                            tables.append(table_content)
                    elif rows:
                        # No headers to group by - keep the table untouched so
                        # its rows are not silently discarded.
                        merged_elements.append(element)
            except Exception as e:
                logger.warning(f"Failed to parse JSON elements response from part {part.id}: {str(e)}")
                continue

    # Merge tables by headers - combine rows from tables with same headers
    for headers_key, tables in table_headers_map.items():
        all_rows = []
        for table_content in tables:
            all_rows.extend(table_content.get("rows", []))

        # Header groups that only ever saw empty tables produce no element
        if all_rows:
            merged_elements.append({
                "type": "table",
                "content": {
                    "headers": list(headers_key),
                    "rows": all_rows
                }
            })

    return {"elements": merged_elements}
def _mergeJsonExtractionResponses(self, content_parts: List[ContentPart], originalContentParts: Optional[List[ContentPart]] = None) -> Dict[str, Any]: def _mergeJsonExtractionResponses(self, content_parts: List[ContentPart], originalContentParts: Optional[List[ContentPart]] = None) -> Dict[str, Any]:
"""Merge multiple JSON extraction responses into one unified response. """Merge multiple JSON extraction responses into one unified response.
Merges: Merges:
- Tables: Combines all table rows, preserves headers, removes duplicates - Tables: Combines all table rows, preserves headers (duplicates preserved)
- Text: Combines all text blocks - Text: Combines all text blocks
- Headings: Combines all headings arrays - Headings: Combines all headings arrays
- Lists: Combines all list items - Lists: Combines all list items
@ -927,16 +1040,10 @@ class ExtractionService:
for headers_key, tables in table_headers_map.items(): for headers_key, tables in table_headers_map.items():
# Collect all rows from tables with same headers # Collect all rows from tables with same headers
all_rows = [] all_rows = []
seen_rows = set() # Track duplicates using tuple representation
for table in tables: for table in tables:
rows = table.get("rows", []) rows = table.get("rows", [])
for row in rows: all_rows.extend(rows)
# Convert row to tuple for duplicate detection
row_tuple = tuple(str(cell) for cell in row)
if row_tuple not in seen_rows:
seen_rows.add(row_tuple)
all_rows.append(row)
# Create merged table # Create merged table
if all_rows: if all_rows:
@ -1450,6 +1557,32 @@ class ExtractionService:
contentPart, prompt, options, failoverModelList, aiObjects, None # Don't pass progressCallback to avoid double logging contentPart, prompt, options, failoverModelList, aiObjects, None # Don't pass progressCallback to avoid double logging
) )
# Write debug files for generation phase (section content generation)
# Check for DATA_GENERATE or DATA_ANALYSE (used for section generation)
isGenerationPhase = False
if options and hasattr(options, 'operationType'):
isGenerationPhase = (options.operationType == OperationTypeEnum.DATA_GENERATE or
options.operationType == OperationTypeEnum.DATA_ANALYSE)
if isGenerationPhase:
if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'):
try:
# Create debug filename with contentPart ID or label
partId = contentPart.id[:8] if contentPart.id else f"part_{partIndex+1}"
partLabelSafe = (contentPart.label or f"part_{partIndex+1}").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30]
debugPrefix = f"generation_contentPart_{partId}_{partLabelSafe}"
# Write prompt
self.services.utils.writeDebugFile(prompt, f"{debugPrefix}_prompt")
# Write response
responseContent = partResult.content if partResult.content else ""
self.services.utils.writeDebugFile(responseContent, f"{debugPrefix}_response")
logger.debug(f"Wrote debug files for contentPart {partId} (generation): {debugPrefix}_prompt, {debugPrefix}_response")
except Exception as debugError:
logger.warning(f"Failed to write debug file for contentPart {contentPart.id}: {str(debugError)}")
# Update completed count and log progress # Update completed count and log progress
completedCount[0] += 1 completedCount[0] += 1
if progressCallback: if progressCallback: