Fixed Vision AI processing of images in pre-extracted content

This commit is contained in:
ValueOn AG 2025-12-25 00:34:45 +01:00
parent 23bb1ff5d3
commit a2315d6ace

View file

@ -1135,7 +1135,8 @@ If no trackable items can be identified, return: {{"kpis": []}}
fileName=preExtracted["originalDocument"]["fileName"],
mimeType=preExtracted["originalDocument"]["mimeType"],
fileSize=preExtracted["originalDocument"].get("fileSize", doc.fileSize),
fileId=doc.fileId # Behalte fileId vom JSON
fileId=doc.fileId, # Behalte fileId vom JSON
messageId=doc.messageId if hasattr(doc, 'messageId') else None # Behalte messageId falls vorhanden
)
resolvedDocuments.append(originalDoc)
else:
@ -1264,6 +1265,39 @@ If no trackable items can be identified, return: {{"kpis": []}}
logger.debug(f"Error resolving pre-extracted document {document.fileName}: {str(e)}")
return None
async def _extractTextFromImage(self, imagePart: ContentPart, extractionPrompt: str) -> Optional[str]:
    """
    Extract text from an image part using Vision AI.

    Args:
        imagePart: ContentPart with typeGroup="image".
        extractionPrompt: Prompt for the text extraction. When it is empty
            or None, a generic "extract all text" prompt is used instead.

    Returns:
        The extracted text with surrounding whitespace removed, or None when
        the AI call failed, returned no content, or returned only whitespace.
    """
    try:
        # Imported inside the function, exactly as the original block does.
        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions

        # Build the AI call request that carries the image part.
        request = AiCallRequest(
            prompt=extractionPrompt or "Extract all text content from this image. Return only the extracted text, no additional formatting.",
            context="",
            options=AiCallOptions(operationType="extraction"),
            contentParts=[imagePart]
        )

        # Delegate the Vision AI processing to the AI service.
        response = await self.services.ai.call(request)

        if response and response.content:
            # A whitespace-only answer carries no text; report it as None so
            # the result matches the documented "text or None" contract
            # instead of leaking an empty string to callers.
            return response.content.strip() or None
        return None
    except Exception as e:
        # Best-effort: any failure is logged and reported as "no text".
        logger.warning(f"Error extracting text from image {imagePart.id}: {str(e)}")
        return None
def _buildIntentAnalysisPrompt(
self,
userPrompt: str,
@ -1420,12 +1454,22 @@ Return ONLY valid JSON following the structure above.
# Wenn Intent "render" für Images hat, erstelle auch object Part
if "render" in partIntent and part.typeGroup == "image" and part.data:
# Image-Part mit render Intent: Erstelle sowohl extracted als auch object Part
# 1. Extracted Part (bereits vorhanden)
part.metadata["intent"] = "extract"
part.metadata["fromExtractContent"] = True
part.metadata["skipExtraction"] = True
part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
allContentParts.append(part)
# 1. Extracted Part - prüfe ob "extract" Intent vorhanden ist
if "extract" in partIntent:
# Image hat sowohl extract als auch render Intent
# Extracted Part: Wird mit Vision AI verarbeitet (skipExtraction=False)
part.metadata["intent"] = "extract"
part.metadata["fromExtractContent"] = True
part.metadata["skipExtraction"] = False # WICHTIG: Vision AI-Verarbeitung nötig!
part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
allContentParts.append(part)
else:
# Nur render Intent - kein Text-Extraktion nötig
part.metadata["intent"] = "render"
part.metadata["fromExtractContent"] = True
part.metadata["skipExtraction"] = True
part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
allContentParts.append(part)
# 2. Object Part für Rendering (base64 data ist bereits im extracted Part)
objectPart = ContentPart(
@ -1444,11 +1488,51 @@ Return ONLY valid JSON following the structure above.
}
)
allContentParts.append(objectPart)
elif part.typeGroup == "image" and "extract" in partIntent:
# Image mit extract Intent: Vision AI-Verarbeitung nötig
# Verarbeite Image mit Vision AI, um Text zu extrahieren
try:
extractedText = await self._extractTextFromImage(part, intent.extractionPrompt if intent else "Extract all text content from this image")
if extractedText:
# Erstelle neuen Text-Part mit extrahiertem Text
textPart = ContentPart(
id=f"extracted_{part.id}",
label=f"Extracted text from {part.label or 'Image'}",
typeGroup="text",
mimeType="text/plain",
data=extractedText,
metadata={
"contentFormat": "extracted",
"documentId": document.id,
"intent": "extract",
"originalFileName": preExtracted["originalDocument"]["fileName"],
"relatedImagePartId": part.id,
"extractionPrompt": intent.extractionPrompt if intent else "Extract all text content from this image"
}
)
allContentParts.append(textPart)
logger.info(f"✅ Extracted text from image {part.id} using Vision AI")
# Wenn auch render Intent vorhanden, füge Image-Part hinzu
if "render" in partIntent:
part.metadata["intent"] = "render"
part.metadata["fromExtractContent"] = True
part.metadata["skipExtraction"] = True
part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
allContentParts.append(part)
except Exception as e:
logger.warning(f"Failed to extract text from image {part.id}: {str(e)}, adding image as-is")
# Fallback: Füge Image-Part hinzu ohne Text-Extraktion
part.metadata["intent"] = "extract"
part.metadata["fromExtractContent"] = True
part.metadata["skipExtraction"] = False
part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
allContentParts.append(part)
else:
# Normales extracted Part
# Normales extracted Part (kein Image oder kein extract Intent)
part.metadata["intent"] = partIntent[0] if partIntent else "extract"
part.metadata["fromExtractContent"] = True
part.metadata["skipExtraction"] = True
part.metadata["skipExtraction"] = True # Bereits extrahiert
part.metadata["originalFileName"] = preExtracted["originalDocument"]["fileName"]
allContentParts.append(part)