From 4e15be8296729fcde3990630da40b9684e4f8329 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Tue, 22 Jul 2025 18:15:02 +0200
Subject: [PATCH] refactored document handling
---
modules/chat/documents/documentCreation.py | 124 -----
...entProcessing.py => documentExtraction.py} | 40 +-
modules/chat/documents/documentGeneration.py | 163 ++++++
modules/chat/documents/documentUtility.py | 132 +++++
modules/chat/handling/handlingActions.py | 466 +++++-------------
modules/chat/handling/handlingTasks.py | 12 +-
modules/chat/serviceCenter.py | 29 +-
notes/changelog.txt | 6 -
notes/methodbased_specification.md | 4 +-
9 files changed, 483 insertions(+), 493 deletions(-)
delete mode 100644 modules/chat/documents/documentCreation.py
rename modules/chat/documents/{documentProcessing.py => documentExtraction.py} (96%)
create mode 100644 modules/chat/documents/documentGeneration.py
create mode 100644 modules/chat/documents/documentUtility.py
diff --git a/modules/chat/documents/documentCreation.py b/modules/chat/documents/documentCreation.py
deleted file mode 100644
index 49e16580..00000000
--- a/modules/chat/documents/documentCreation.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Contains all document creation functions extracted from managerChat.py
-
-import logging
-import json
-from typing import Dict, Any, Optional, List, Union
-from datetime import datetime, UTC
-
-class DocumentCreator:
- def __init__(self, service):
- self.service = service
-
- def getFileExtension(self, filename: str) -> str:
- """Extract file extension from filename"""
- return self.service.getFileExtension(filename)
-
- def getMimeType(self, extension: str) -> str:
- """Get MIME type based on file extension"""
- return self.service.getMimeTypeFromExtension(extension)
-
- def detectMimeTypeFromContent(self, content: Any, filename: str) -> str:
- """
- Detect MIME type from content and filename using service center.
- Only returns a detected MIME type if it's better than application/octet-stream.
- """
- try:
- if isinstance(content, str):
- file_bytes = content.encode('utf-8')
- elif isinstance(content, dict):
- file_bytes = json.dumps(content, ensure_ascii=False).encode('utf-8')
- else:
- file_bytes = str(content).encode('utf-8')
- detected_mime_type = self.service.detectContentTypeFromData(file_bytes, filename)
- if detected_mime_type != "application/octet-stream":
- return detected_mime_type
- return "application/octet-stream"
- except Exception as e:
- logging.warning(f"Error in MIME type detection for {filename}: {str(e)}")
- return 'application/octet-stream'
-
- def detectMimeTypeFromDocument(self, document: Any, filename: str) -> str:
- """
- Detect MIME type from document object using service center.
- Only returns a detected MIME type if it's better than application/octet-stream.
- """
- try:
- content = getattr(document, 'content', '')
- if isinstance(content, str):
- file_bytes = content.encode('utf-8')
- else:
- file_bytes = str(content).encode('utf-8')
- detected_mime_type = self.service.detectContentTypeFromData(file_bytes, filename)
- if detected_mime_type != "application/octet-stream":
- return detected_mime_type
- return "application/octet-stream"
- except Exception as e:
- logging.warning(f"Error in MIME type detection for document {filename}: {str(e)}")
- return 'application/octet-stream'
-
- def convertDocumentDataToString(self, document_data: Dict[str, Any], file_extension: str) -> str:
- """Convert document data to string content based on file type with enhanced processing"""
- try:
- if document_data is None:
- return ""
- if isinstance(document_data, str):
- return document_data
- if isinstance(document_data, dict):
- if file_extension == 'json':
- return json.dumps(document_data, indent=2, ensure_ascii=False)
- elif file_extension in ['txt', 'md', 'html', 'css', 'js', 'py']:
- text_fields = ['content', 'text', 'data', 'result', 'summary', 'extracted_content', 'table_data']
- for field in text_fields:
- if field in document_data:
- content = document_data[field]
- if isinstance(content, str):
- return content
- elif isinstance(content, (dict, list)):
- return json.dumps(content, indent=2, ensure_ascii=False)
- return json.dumps(document_data, indent=2, ensure_ascii=False)
- elif file_extension == 'csv':
- csv_fields = ['table_data', 'csv_data', 'rows', 'data', 'content', 'text']
- for field in csv_fields:
- if field in document_data:
- content = document_data[field]
- if isinstance(content, str):
- return content
- elif isinstance(content, list):
- if content and isinstance(content[0], (list, dict)):
- import csv
- import io
- output = io.StringIO()
- if isinstance(content[0], dict):
- if content:
- fieldnames = content[0].keys()
- writer = csv.DictWriter(output, fieldnames=fieldnames)
- writer.writeheader()
- writer.writerows(content)
- else:
- writer = csv.writer(output)
- writer.writerows(content)
- return output.getvalue()
- return json.dumps(document_data, indent=2, ensure_ascii=False)
- else:
- return json.dumps(document_data, indent=2, ensure_ascii=False)
- elif isinstance(document_data, list):
- if file_extension == 'csv':
- import csv
- import io
- output = io.StringIO()
- if document_data and isinstance(document_data[0], dict):
- fieldnames = document_data[0].keys()
- writer = csv.DictWriter(output, fieldnames=fieldnames)
- writer.writeheader()
- writer.writerows(document_data)
- else:
- writer = csv.writer(output)
- writer.writerows(document_data)
- return output.getvalue()
- else:
- return json.dumps(document_data, indent=2, ensure_ascii=False)
- else:
- return str(document_data)
- except Exception as e:
- logging.error(f"Error converting document data to string: {str(e)}")
- return str(document_data)
\ No newline at end of file
diff --git a/modules/chat/documents/documentProcessing.py b/modules/chat/documents/documentExtraction.py
similarity index 96%
rename from modules/chat/documents/documentProcessing.py
rename to modules/chat/documents/documentExtraction.py
index 323c6f7f..8bd1a563 100644
--- a/modules/chat/documents/documentProcessing.py
+++ b/modules/chat/documents/documentExtraction.py
@@ -9,6 +9,13 @@ from pathlib import Path
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import uuid
+from .documentUtility import (
+ getFileExtension,
+ getMimeTypeFromExtension,
+ detectMimeTypeFromContent,
+ detectMimeTypeFromData,
+ convertDocumentDataToString
+)
from modules.interfaces.interfaceChatModel import (
ExtractedContent,
@@ -29,7 +36,7 @@ class FileProcessingError(Exception):
"""Custom exception for file processing errors."""
pass
-class DocumentProcessor:
+class DocumentExtraction:
"""Processor for handling document operations and content extraction."""
def __init__(self, serviceCenter=None):
@@ -133,17 +140,13 @@ class DocumentProcessor:
# Decode base64 if needed
if base64Encoded:
fileData = base64.b64decode(fileData)
-
- # Detect content type if needed
+ # Use documentUtility for mime type detection
if mimeType == "application/octet-stream":
- mimeType = self._serviceCenter.detectContentTypeFromData(fileData, filename)
-
+ mimeType = detectMimeTypeFromData(fileData, filename, self._serviceCenter)
# Process document based on type
if mimeType not in self.supportedTypes:
- # Fallback to binary processing
contentItems = await self._processBinary(fileData, filename, mimeType)
else:
- # Process document based on type
processor = self.supportedTypes[mimeType]
contentItems = await processor(fileData, filename, mimeType)
@@ -171,13 +174,15 @@ class DocumentProcessor:
"""Process text document"""
try:
content = fileData.decode('utf-8')
+ # Use documentUtility for mime type
+ mime_type = getMimeTypeFromExtension(getFileExtension(filename), self._serviceCenter)
return [ContentItem(
label="main",
data=content,
metadata=ContentMetadata(
size=len(content.encode('utf-8')),
pages=1,
- mimeType="text/plain",
+ mimeType=mime_type,
base64Encoded=False
)
)]
@@ -189,13 +194,14 @@ class DocumentProcessor:
"""Process CSV document"""
try:
content = fileData.decode('utf-8')
+ mime_type = getMimeTypeFromExtension(getFileExtension(filename), self._serviceCenter)
return [ContentItem(
label="main",
data=content,
metadata=ContentMetadata(
size=len(content.encode('utf-8')),
pages=1,
- mimeType="text/csv",
+ mimeType=mime_type,
base64Encoded=False
)
)]
@@ -207,16 +213,15 @@ class DocumentProcessor:
"""Process JSON document"""
try:
content = fileData.decode('utf-8')
- # Parse JSON to validate
jsonData = json.loads(content)
-
+ mime_type = getMimeTypeFromExtension(getFileExtension(filename), self._serviceCenter)
return [ContentItem(
label="main",
data=content,
metadata=ContentMetadata(
size=len(content.encode('utf-8')),
pages=1,
- mimeType="application/json",
+ mimeType=mime_type,
base64Encoded=False
)
)]
@@ -228,13 +233,14 @@ class DocumentProcessor:
"""Process XML document"""
try:
content = fileData.decode('utf-8')
+ mime_type = getMimeTypeFromExtension(getFileExtension(filename), self._serviceCenter)
return [ContentItem(
label="main",
data=content,
metadata=ContentMetadata(
size=len(content.encode('utf-8')),
pages=1,
- mimeType="application/xml",
+ mimeType=mime_type,
base64Encoded=False
)
)]
@@ -246,13 +252,14 @@ class DocumentProcessor:
"""Process HTML document"""
try:
content = fileData.decode('utf-8')
+ mime_type = getMimeTypeFromExtension(getFileExtension(filename), self._serviceCenter)
return [ContentItem(
label="main",
data=content,
metadata=ContentMetadata(
size=len(content.encode('utf-8')),
pages=1,
- mimeType="text/html",
+ mimeType=mime_type,
base64Encoded=False
)
)]
@@ -264,15 +271,14 @@ class DocumentProcessor:
"""Process SVG document"""
try:
content = fileData.decode('utf-8')
- # Check if it's actually SVG
isSvg = "