From a2315d6ace26832ad225db782d03904d6d5f91f5 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Thu, 25 Dec 2025 00:34:45 +0100
Subject: [PATCH] fixed vision for pre-extracted content
---
modules/services/serviceAi/mainServiceAi.py | 102 ++++++++++++++++++--
1 file changed, 93 insertions(+), 9 deletions(-)
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index 331a3289..74b90346 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -1135,7 +1135,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
fileName=preExtracted["originalDocument"]["fileName"],
mimeType=preExtracted["originalDocument"]["mimeType"],
fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
- fileId=doc.fileId # Behalte fileId vom JSON
+ fileId=doc.fileId, # Behalte fileId vom JSON
+ messageId=doc.messageId if hasattr(doc, 'messageId') else None # Behalte messageId falls vorhanden
)
resolvedDocuments.append(originalDoc)
else:
@@ -1264,6 +1265,39 @@ If no trackable items can be identified, return: {{"kpis": []}}
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
return None
+ async def _extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
+ """
+ Extract text from an image part by sending it through the AI service (Vision AI).
+
+ Args:
+ imagePart: ContentPart with typeGroup="image"
+ extractionPrompt: Prompt for the text extraction; a built-in default prompt is used when it is empty
+
+ Returns:
+ Stripped response text, or None when the response has no content or any error occurs (errors are logged)
+ """
+ try:
+ from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions
+
+ # Build the AI call request carrying the image part
+ request = AiCallRequest(
+ prompt=extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting.",
+ context="",
+ options=AiCallOptions(operationType="extraction"),
+ contentParts=[imagePart]
+ )
+
+ # Use the AI service for the Vision AI processing
+ response = await self.services.ai.call(request)
+
+ if response and response.content:
+ return response.content.strip()
+
+ return None
+ except Exception as e:
+ logger.warning(f"Error extracting text from image {imagePart.id}: {str(e)}")
+ return None
+
def _buildIntentAnalysisPrompt(
self,
userPrompt: str,
@@ -1420,12 +1454,22 @@ Return ONLY valid JSON following the structure above.
# Wenn Intent "render" für Images hat, erstelle auch object Part
if "render" in partIntent and part.typeGroup == "image" and part.data:
# Image-Part mit render Intent: Erstelle sowohl extracted als auch object Part
- # 1. Extracted Part (bereits vorhanden)
- part.metadata["intent"] = "extract"
- part.metadata["fromExtractContent"] = True
- part.metadata["skipExtraction"] = True
- part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
- allContentParts.append(part)
+ # 1. Extracted Part - prüfe ob "extract" Intent vorhanden ist
+ if "extract" in partIntent:
+ # Image hat sowohl extract als auch render Intent
+ # Extracted Part: Wird mit Vision AI verarbeitet (skipExtraction=False)
+ part.metadata["intent"] = "extract"
+ part.metadata["fromExtractContent"] = True
+ part.metadata["skipExtraction"] = False # WICHTIG: Vision AI-Verarbeitung nötig!
+ part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+ allContentParts.append(part)
+ else:
+ # Nur render Intent - kein Text-Extraktion nötig
+ part.metadata["intent"] = "render"
+ part.metadata["fromExtractContent"] = True
+ part.metadata["skipExtraction"] = True
+ part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+ allContentParts.append(part)
# 2. Object Part für Rendering (base64 data ist bereits im extracted Part)
objectPart = ContentPart(
@@ -1444,11 +1488,51 @@ Return ONLY valid JSON following the structure above.
}
)
allContentParts.append(objectPart)
+ elif part.typeGroup == "image" and "extract" in partIntent:
+ # Image mit extract Intent: Vision AI-Verarbeitung nötig
+ # Verarbeite Image mit Vision AI, um Text zu extrahieren
+ try:
+ extractedText = await self._extractTextFromImage(part, intent.extractionPrompt if intent else "Extract all text content from this image")
+ if extractedText:
+ # Erstelle neuen Text-Part mit extrahiertem Text
+ textPart = ContentPart(
+ id=f"extracted_{part.id}",
+ label=f"Extracted text from {part.label or 'Image'}",
+ typeGroup="text",
+ mimeType="text/plain",
+ data=extractedText,
+ metadata={
+ "contentFormat": "extracted",
+ "documentId": document.id,
+ "intent": "extract",
+ "originalFileName": preExtracted["originalDocument"]["fileName"],
+ "relatedImagePartId": part.id,
+ "extractionPrompt": intent.extractionPrompt if intent else "Extract all text content from this image"
+ }
+ )
+ allContentParts.append(textPart)
+ logger.info(f"✅ Extracted text from image {part.id} using Vision AI")
+
+ # Wenn auch render Intent vorhanden, füge Image-Part hinzu
+ if "render" in partIntent:
+ part.metadata["intent"] = "render"
+ part.metadata["fromExtractContent"] = True
+ part.metadata["skipExtraction"] = True
+ part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+ allContentParts.append(part)
+ except Exception as e:
+ logger.warning(f"Failed to extract text from image {part.id}: {str(e)}, adding image as-is")
+ # Fallback: Füge Image-Part hinzu ohne Text-Extraktion
+ part.metadata["intent"] = "extract"
+ part.metadata["fromExtractContent"] = True
+ part.metadata["skipExtraction"] = False
+ part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
+ allContentParts.append(part)
else:
- # Normales extracted Part
+ # Normales extracted Part (kein Image oder kein extract Intent)
part.metadata["intent"] = partIntent[0] if partIntent else "extract"
part.metadata["fromExtractContent"] = True
- part.metadata["skipExtraction"] = True
+ part.metadata["skipExtraction"] = True # Bereits extrahiert
part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
allContentParts.append(part)