From 501cebe3420d7baa4128132117c1e59e2cdf58d8 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Tue, 30 Sep 2025 18:30:33 +0200
Subject: [PATCH] start testing with backend running
---
modules/datamodels/datamodelExtraction.py | 28 +-
modules/services/__init__.py | 13 +-
modules/services/serviceAi/mainServiceAi.py | 22 +-
.../mainServiceDocumentExtraction.py | 2054 -----------------
.../chunking/structure_chunker.py | 2 +-
.../chunking/table_chunker.py | 2 +-
.../chunking/text_chunker.py | 2 +-
.../mainServiceExtraction.py | 83 +-
.../services/serviceExtraction/subRegistry.py | 2 +-
.../mainServiceGeneration.py} | 123 +-
.../subDocumentUtility.py | 0
.../serviceWorkflow/mainServiceWorkflow.py | 115 +-
modules/services/test_all_services.py | 226 ++
modules/workflows/methods/methodAi.py | 115 +-
modules/workflows/methods/methodDocument.py | 123 +-
modules/workflows/methods/methodOutlook.py | 74 +-
modules/workflows/methods/methodSharepoint.py | 10 +-
modules/workflows/processing/handlingTasks.py | 43 +-
modules/workflows/processing/promptFactory.py | 2 +-
modules/workflows/workflowManager.py | 6 +-
testdata/00Untitled.jpg | Bin 0 -> 417091 bytes
testdata/Muster_Kundenliste_Test1.xlsx | Bin 0 -> 27041 bytes
testdata/diagramm_komponenten.pdf | Bin 0 -> 88241 bytes
23 files changed, 618 insertions(+), 2427 deletions(-)
delete mode 100644 modules/services/serviceDocument/mainServiceDocumentExtraction.py
rename modules/services/{serviceDocument/mainServiceDocumentGeneration.py => serviceGeneration/mainServiceGeneration.py} (59%)
rename modules/services/{serviceDocument => serviceGeneration}/subDocumentUtility.py (100%)
create mode 100644 modules/services/test_all_services.py
create mode 100644 testdata/00Untitled.jpg
create mode 100644 testdata/Muster_Kundenliste_Test1.xlsx
create mode 100644 testdata/diagramm_komponenten.pdf
diff --git a/modules/datamodels/datamodelExtraction.py b/modules/datamodels/datamodelExtraction.py
index 78ce657e..61d12977 100644
--- a/modules/datamodels/datamodelExtraction.py
+++ b/modules/datamodels/datamodelExtraction.py
@@ -1,21 +1,19 @@
from typing import Any, Dict, List, Optional
-from dataclasses import dataclass, field
+from pydantic import BaseModel, Field
-@dataclass
-class ContentPart:
- id: str
- parentId: Optional[str]
- label: str
- typeGroup: str
- mimeType: str
- data: str
- metadata: Dict[str, Any] = field(default_factory=dict)
+class ContentPart(BaseModel):
+ id: str = Field(description="Unique content part identifier")
+ parentId: Optional[str] = Field(default=None, description="Optional parent content part id")
+ label: str = Field(description="Human readable label of the part")
+ typeGroup: str = Field(description="Logical type group: text, table, structure, binary, ...")
+ mimeType: str = Field(description="MIME type of the part payload")
+ data: str = Field(default="", description="Primary data payload, often extracted text")
+ metadata: Dict[str, Any] = Field(default_factory=dict, description="Arbitrary metadata for the part")
-@dataclass
-class ExtractedContent:
- id: str
- parts: List[ContentPart]
- summary: Optional[Dict[str, Any]] = None
+class ExtractedContent(BaseModel):
+ id: str = Field(description="Extraction id or source document id")
+ parts: List[ContentPart] = Field(default_factory=list, description="List of extracted parts")
+ summary: Optional[Dict[str, Any]] = Field(default=None, description="Optional extraction summary")
diff --git a/modules/services/__init__.py b/modules/services/__init__.py
index a8882a2c..8f7843e5 100644
--- a/modules/services/__init__.py
+++ b/modules/services/__init__.py
@@ -55,11 +55,11 @@ class Services:
# Initialize service packages
- from .serviceDocument.mainServiceDocumentExtraction import DocumentExtractionService
- self.documentExtraction = PublicService(DocumentExtractionService(self))
+ from .serviceExtraction.mainServiceExtraction import ExtractionService
+ self.extraction = PublicService(ExtractionService(self))
- from .serviceDocument.mainServiceDocumentGeneration import DocumentGenerationService
- self.documentGeneration = PublicService(DocumentGenerationService(self))
+ from .serviceGeneration.mainServiceGeneration import GenerationService
+ self.generation = PublicService(GenerationService(self))
from .serviceNeutralization.mainServiceNeutralization import NeutralizationService
self.neutralization = PublicService(NeutralizationService(self))
@@ -76,14 +76,9 @@ class Services:
from .serviceWorkflow.mainServiceWorkflow import WorkflowService
self.workflow = PublicService(WorkflowService(self))
- from .serviceWeb.mainServiceWeb import WebService
- self.web = PublicService(WebService(self))
-
from .serviceUtils.mainServiceUtils import UtilsService
self.utils = PublicService(UtilsService(self))
- async def extractContentFromDocument(self, prompt, document):
- return await self.services.documentExtraction.extractContentFromDocument(prompt, document)
def getInterface(user: User, workflow: ChatWorkflow) -> Services:
return Services(user, workflow)
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index be0b0bad..9d1e4735 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -196,7 +196,7 @@ class AiService:
processedContents: List[str] = []
try:
- extractionResult = self.extractionService.extractDocuments(documentList, extractionOptions)
+ extractionResult = self.extractionService.extractContent(documentList, extractionOptions)
def _partsToText(parts) -> str:
lines: List[str] = []
@@ -205,7 +205,7 @@ class AiService:
lines.append(p.data)
return "\n\n".join(lines)
- if processIndividually and isinstance(extractionResult, list):
+ if isinstance(extractionResult, list):
for i, ec in enumerate(extractionResult):
try:
contentText = _partsToText(ec.parts)
@@ -216,9 +216,8 @@ class AiService:
logger.warning(f"Error aggregating extracted content: {str(e)}")
processedContents.append("[Error aggregating content]")
else:
- # pooled mode returns dict
- parts = extractionResult.get("parts", []) if isinstance(extractionResult, dict) else []
- contentText = _partsToText(parts)
+ # Fallback: no content
+ contentText = ""
if compressDocuments and len(contentText.encode("utf-8")) > 10000:
contentText = await self._compressContent(contentText, 10000, "document")
processedContents.append(contentText)
@@ -359,7 +358,7 @@ class AiService:
"mimeType": d.mimeType
} for d in documents]
- extracted_content = await self.extractionService.extractDocuments(
+ extracted_content = await self.extractionService.extractContent(
documentList=documentList,
options={
"prompt": prompt,
@@ -371,8 +370,15 @@ class AiService:
}
)
- # Get text content from extracted parts using typeGroup-aware processing
- context = self._extractTextFromContentParts(extracted_content)
+ # Build context from list of ExtractedContent
+ if isinstance(extracted_content, list):
+ context = "\n\n---\n\n".join([
+ "\n\n".join([
+ p.data for p in ec.parts if p.typeGroup in ["text", "table", "structure"] and p.data
+ ]) for ec in extracted_content
+ ])
+ else:
+ context = ""
# Check size and reduce if needed
full_prompt = prompt + "\n\n" + context if context else prompt
diff --git a/modules/services/serviceDocument/mainServiceDocumentExtraction.py b/modules/services/serviceDocument/mainServiceDocumentExtraction.py
deleted file mode 100644
index 1a658fb8..00000000
--- a/modules/services/serviceDocument/mainServiceDocumentExtraction.py
+++ /dev/null
@@ -1,2054 +0,0 @@
-from typing import Dict, Any, List, Optional, Union, Tuple, TypedDict, Callable, Awaitable
-import logging
-import json
-import os
-import io
-import base64
-from datetime import datetime, UTC
-from pathlib import Path
-import xml.etree.ElementTree as ET
-from bs4 import BeautifulSoup
-import uuid
-from modules.services.serviceDocument.subDocumentUtility import (
- getFileExtension,
- getMimeTypeFromExtension,
- detectMimeTypeFromContent,
- detectMimeTypeFromData,
- convertDocumentDataToString
-)
-
-from modules.datamodels.datamodelWorkflow import ExtractedContent
-from modules.datamodels.datamodelChat import ContentItem, ContentMetadata, ChatDocument
-from modules.services.serviceNeutralization.mainServiceNeutralization import NeutralizationService
-from modules.shared.configuration import APP_CONFIG
-from modules.services.serviceAi.mainServiceAi import AiService
-from modules.interfaces.interfaceAiObjects import AiObjects
-
-logger = logging.getLogger(__name__)
-
-# Optional imports - only loaded when needed
-pdfExtractorLoaded = False
-officeExtractorLoaded = False
-imageProcessorLoaded = False
-
-class FileProcessingError(Exception):
- """Custom exception for file processing errors."""
- pass
-
-class DocumentExtractionService:
- """Processor for handling document operations and content extraction."""
-
- def __init__(self, serviceCenter=None):
- """Initialize the document processor."""
- self._neutralizer = NeutralizationService() if APP_CONFIG.get("ENABLE_CONTENT_NEUTRALIZATION", False) else None
- self._serviceCenter = serviceCenter
- # Store service center for access to user/workflow context when needed
- self.services = None # Will be set to None to avoid circular dependency
-
- self.supportedTypes: Dict[str, Callable[[bytes, str, str], Awaitable[List[ContentItem]]]] = {
- # Text and data files
- 'text/plain': self._processText,
- 'text/csv': self._processCsv,
- 'application/json': self._processJson,
- 'application/xml': self._processXml,
- 'text/html': self._processHtml,
- 'image/svg+xml': self._processSvg,
-
- # Programming languages
- 'application/javascript': self._processText,
- 'application/typescript': self._processText,
- 'text/jsx': self._processText,
- 'text/tsx': self._processText,
- 'text/x-python': self._processText,
- 'text/x-java-source': self._processText,
- 'text/x-c': self._processText,
- 'text/x-c++src': self._processText,
- 'text/x-c++hdr': self._processText,
- 'text/x-csharp': self._processText,
- 'application/x-httpd-php': self._processText,
- 'text/x-ruby': self._processText,
- 'text/x-go': self._processText,
- 'text/x-rust': self._processText,
- 'text/x-swift': self._processText,
- 'text/x-kotlin': self._processText,
- 'text/x-scala': self._processText,
- 'text/x-r': self._processText,
- 'text/x-matlab': self._processText,
- 'text/x-perl': self._processText,
- 'application/x-sh': self._processText,
- 'application/x-powershell': self._processText,
- 'application/x-msdos-program': self._processText,
- 'text/vbscript': self._processText,
- 'text/x-lua': self._processText,
- 'application/sql': self._processText,
- 'application/dart': self._processText,
- 'text/x-elm': self._processText,
- 'text/x-clojure': self._processText,
- 'text/x-haskell': self._processText,
- 'text/x-fsharp': self._processText,
- 'text/x-ocaml': self._processText,
-
- # Web technologies
- 'text/css': self._processText,
- 'text/x-scss': self._processText,
- 'text/x-sass': self._processText,
- 'text/x-less': self._processText,
- 'text/x-vue': self._processText,
- 'text/x-svelte': self._processText,
- 'text/x-astro': self._processText,
-
- # Configuration and build files
- 'application/x-yaml': self._processText,
- 'application/toml': self._processText,
- 'text/x-dockerfile': self._processText,
- 'text/x-makefile': self._processText,
- 'text/x-cmake': self._processText,
- 'text/x-gradle': self._processText,
- 'text/x-maven': self._processText,
-
- # Documentation and markup
- 'text/markdown': self._processText,
- 'text/x-rst': self._processText,
- 'application/x-tex': self._processText,
- 'text/x-bibtex': self._processText,
- 'text/asciidoc': self._processText,
- 'text/x-wiki': self._processText,
-
- # Images
- 'image/jpeg': self._processImage,
- 'image/png': self._processImage,
- 'image/gif': self._processImage,
- 'image/webp': self._processImage,
- 'image/bmp': self._processImage,
- 'image/tiff': self._processImage,
- 'image/x-icon': self._processImage,
-
- # Documents
- 'application/pdf': self._processPdf,
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._processDocx,
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': self._processXlsx,
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': self._processPptx,
- 'application/vnd.oasis.opendocument.text': self._processText,
- 'application/vnd.oasis.opendocument.spreadsheet': self._processText,
- 'application/vnd.oasis.opendocument.presentation': self._processText,
-
- # Legacy Office formats
- 'application/msword': self._processLegacyDoc,
- 'application/vnd.ms-excel': self._processLegacyXls,
- 'application/vnd.ms-powerpoint': self._processLegacyPpt
- }
-
- self.chunkSizes = {
- "text": 40000, # General text content
- "plain": 40000, # Plain text
- "csv": 40000, # CSV data
- "json": 40000, # JSON data
- "xml": 40000, # XML data
- "html": 40000, # HTML content
- "markdown": 40000, # Markdown content
- "code": 80000, # Programming code (increased for better preservation)
- "script": 80000, # Script files (increased for better preservation)
- "javascript": 80000, # JavaScript files specifically
- "typescript": 80000, # TypeScript files specifically
- "config": 40000, # Configuration files
- "image": 1024 * 1024, # 1MB for images
- "video": 5 * 1024 * 1024, # 5MB for video chunks
- "binary": 1024 * 1024, # 1MB for binary data
- "pdf": 40000, # PDF text content
- "docx": 40000, # Word document text
- "xlsx": 40000, # Excel data
- "svg": 40000 # SVG content
- }
-
- def _robustTextDecode(self, fileData: bytes, fileName: str = "unknown") -> str:
- """
- Robustly decode text data with multiple encoding fallbacks.
-
- Args:
- fileData: Raw bytes to decode
- fileName: fileName for logging purposes
-
- Returns:
- Decoded text string
-
- Raises:
- FileProcessingError: If all decoding attempts fail
- """
- # Try multiple encoding options in order of likelihood
- encodings_to_try = ['utf-8', 'windows-1252', 'iso-8859-1', 'latin-1', 'cp1252']
- content = None
-
- # First try UTF-8 (most common)
- try:
- content = fileData.decode('utf-8')
-
- return content
- except UnicodeDecodeError:
- pass
-
- # Try other encodings
- for encoding in encodings_to_try[1:]:
- try:
- content = fileData.decode(encoding)
-
- return content
- except UnicodeDecodeError:
- continue
-
- # If all encodings fail, try with error handling
- try:
- # Try with chardet for automatic detection
- import chardet
- detected = chardet.detect(fileData)
- if detected['confidence'] > 0.7:
- detected_encoding = detected['encoding']
- content = fileData.decode(detected_encoding, errors='replace')
-
- return content
- else:
- # Last resort: decode with replacement characters
- content = fileData.decode('utf-8', errors='replace')
- logger.warning(f"{fileName}: decoded with UTF-8 and replacement characters due to low encoding confidence")
- return content
- except ImportError:
- # chardet not available, use replacement characters
- content = fileData.decode('utf-8', errors='replace')
- logger.warning(f"{fileName}: decoded with UTF-8 and replacement characters (chardet not available)")
- return content
-
- # This should never be reached, but just in case
- raise FileProcessingError(f"Failed to decode {fileName} with any encoding")
-
- def _loadPdfExtractor(self):
- """Loads PDF extraction libraries when needed"""
- global pdfExtractorLoaded
- if not pdfExtractorLoaded:
- try:
- global PyPDF2, fitz
- import PyPDF2
- import fitz # PyMuPDF for more extensive PDF processing
- pdfExtractorLoaded = True
- logger.debug("PDF extraction libraries successfully loaded")
- except ImportError as e:
- logger.warning(f"PDF extraction libraries could not be loaded: {e}")
-
- def _loadOfficeExtractor(self):
- """Loads Office document extraction libraries when needed"""
- global officeExtractorLoaded
- if not officeExtractorLoaded:
- try:
- global docx, openpyxl
- import docx # python-docx for Word documents
- import openpyxl # for Excel files
- officeExtractorLoaded = True
- logger.debug("Office extraction libraries successfully loaded")
- except ImportError as e:
- logger.warning(f"Office extraction libraries could not be loaded: {e}")
-
- def _loadImageProcessor(self):
- """Loads image processing libraries when needed"""
- global imageProcessorLoaded
- if not imageProcessorLoaded:
- try:
- global PIL, Image
- from PIL import Image
- imageProcessorLoaded = True
- logger.debug("Image processing libraries successfully loaded")
- except ImportError as e:
- logger.warning(f"Image processing libraries could not be loaded: {e}")
-
-
-
- async def processFileData(self, fileData: bytes, fileName: str, mimeType: str, base64Encoded: bool = False, prompt: str = None, documentId: str = None, enableAI: bool = True) -> ExtractedContent:
- """
- Process file data directly and extract its contents with optional AI processing.
-
- Args:
- fileData: Raw file data as bytes
- fileName: Name of the file
- mimeType: MIME type of the file
- base64Encoded: Whether the data is base64 encoded
- prompt: Prompt for AI content extraction
- documentId: Optional document ID
- enableAI: Whether to enable AI processing (default: True)
-
- Returns:
- ExtractedContent containing the processed content
-
- Raises:
- FileProcessingError: If document processing fails
- """
- try:
- # Decode base64 if needed
- if base64Encoded:
- fileData = base64.b64decode(fileData)
- # Use subDocumentUtility for mime type detection
- if mimeType == "application/octet-stream":
- mimeType = detectMimeTypeFromData(fileData, fileName, self._serviceCenter)
- # Process document based on type
- if mimeType not in self.supportedTypes:
- contentItems = await self._processBinary(fileData, fileName, mimeType)
- else:
- processor = self.supportedTypes[mimeType]
- contentItems = await processor(fileData, fileName, mimeType)
-
- # Process with AI if prompt provided and AI is enabled
- if enableAI and prompt and contentItems:
- try:
- # Process each content item with AI
- processedItems = await self._aiDataExtraction(contentItems, prompt)
- contentItems = processedItems
- except Exception as e:
- logger.error(f"Error processing content with AI: {str(e)}")
- elif not enableAI:
- logger.debug(f"AI processing disabled for {fileName}, returning raw extracted content")
-
- return ExtractedContent(
- id=documentId if documentId else str(uuid.uuid4()),
- contents=contentItems
- )
-
- except Exception as e:
- logger.error(f"Error processing file data: {str(e)}")
- raise FileProcessingError(f"Failed to process file data: {str(e)}")
-
-
-
- async def _processText(self, fileData: bytes, fileName: str, mimeType: str) -> List[ContentItem]:
- """Process text document with robust encoding detection and complete content extraction"""
- try:
- content = self._robustTextDecode(fileData, fileName)
-
- # Validate that we got the complete content
- if not content or len(content.strip()) == 0:
- logger.warning(f"Empty content extracted from {fileName}")
- return [ContentItem(
- label="empty",
- data="[Empty file or no readable content]",
- metadata=ContentMetadata(
- size=0,
- pages=1,
- mimeType="text/plain",
- base64Encoded=False
- )
- )]
-
- # Log content size for debugging
- content_size = len(content.encode('utf-8'))
-
-
- # Use subDocumentUtility for mime type
- mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
- return [ContentItem(
- label="main",
- data=content,
- metadata=ContentMetadata(
- size=content_size,
- pages=1,
- mimeType=mime_type,
- base64Encoded=False
- )
- )]
- except Exception as e:
- logger.error(f"Error processing text document: {str(e)}")
- raise FileProcessingError(f"Failed to process text document: {str(e)}")
-
- async def _processCsv(self, fileData: bytes, fileName: str, mimeType: str) -> List[ContentItem]:
- """Process CSV document with robust encoding detection"""
- try:
- content = self._robustTextDecode(fileData, fileName)
- mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
- return [ContentItem(
- label="main",
- data=content,
- metadata=ContentMetadata(
- size=len(content.encode('utf-8')),
- pages=1,
- mimeType=mime_type,
- base64Encoded=False
- )
- )]
- except Exception as e:
- logger.error(f"Error processing CSV document: {str(e)}")
- raise FileProcessingError(f"Failed to process CSV document: {str(e)}")
-
- async def _processJson(self, fileData: bytes, fileName: str, mimeType: str) -> List[ContentItem]:
- """Process JSON document with robust encoding detection"""
- try:
- content = self._robustTextDecode(fileData, fileName)
- jsonData = json.loads(content)
- mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
- return [ContentItem(
- label="main",
- data=content,
- metadata=ContentMetadata(
- size=len(content.encode('utf-8')),
- pages=1,
- mimeType=mime_type,
- base64Encoded=False
- )
- )]
- except Exception as e:
- logger.error(f"Error processing JSON document: {str(e)}")
- raise FileProcessingError(f"Failed to process JSON document: {str(e)}")
-
- async def _processXml(self, fileData: bytes, fileName: str, mimeType: str) -> List[ContentItem]:
- """Process XML document with robust encoding detection"""
- try:
- content = self._robustTextDecode(fileData, fileName)
- mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
- return [ContentItem(
- label="main",
- data=content,
- metadata=ContentMetadata(
- size=len(content.encode('utf-8')),
- pages=1,
- mimeType=mime_type,
- base64Encoded=False
- )
- )]
- except Exception as e:
- logger.error(f"Error processing XML document: {str(e)}")
- raise FileProcessingError(f"Failed to process XML document: {str(e)}")
-
- async def _processHtml(self, fileData: bytes, fileName: str, mimeType: str) -> List[ContentItem]:
- """Process HTML document with robust encoding detection"""
- try:
- content = self._robustTextDecode(fileData, fileName)
- mime_type = getMimeTypeFromExtension(getFileExtension(fileName))
- return [ContentItem(
- label="main",
- data=content,
- metadata=ContentMetadata(
- size=len(content.encode('utf-8')),
- pages=1,
- mimeType=mime_type,
- base64Encoded=False
- )
- )]
- except Exception as e:
- logger.error(f"Error processing HTML document: {str(e)}")
- raise FileProcessingError(f"Failed to process HTML document: {str(e)}")
-
- async def _processSvg(self, fileData: bytes, fileName: str, mimeType: str) -> List[ContentItem]:
- """Process SVG document with robust encoding detection and meaningful content extraction"""
- try:
- content = self._robustTextDecode(fileData, fileName)
-
- # Check if it's actually SVG content
- if "