Unlimited file size support and the content normalizer are initially included
This commit is contained in:
parent
ea5f42d981
commit
2f7d73f2ce
3 changed files with 346 additions and 148 deletions
|
|
@ -88,6 +88,8 @@ class ContentItem(BaseModel, ModelMixin):
|
|||
"""Individual content item from a document"""
|
||||
label: str = Field(description="Content label (e.g., tab name, tag name)")
|
||||
data: str = Field(description="Extracted text content")
|
||||
mimeType: str = Field(description="MIME type of the content")
|
||||
base64Encoded: bool = Field(description="Whether the data is base64 encoded")
|
||||
metadata: ContentMetadata = Field(description="Content metadata")
|
||||
|
||||
# Register labels for ContentItem
|
||||
|
|
|
|||
|
|
@ -1,106 +0,0 @@
|
|||
"""
|
||||
Document Manager Module for handling document operations and content extraction.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
from typing import List, Optional, Dict, Any, Union
|
||||
from pathlib import Path
|
||||
import uuid
|
||||
|
||||
from modules.interfaces.serviceChatModel import (
|
||||
ChatDocument,
|
||||
TaskDocument,
|
||||
ExtractedContent,
|
||||
ContentItem,
|
||||
ContentMetadata
|
||||
)
|
||||
from modules.workflow.serviceContainer import ServiceContainer
|
||||
from modules.workflow.processorDocument import DocumentProcessor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentManager:
    """Manager for document operations and content extraction"""

    def __init__(self, serviceContainer: ServiceContainer):
        # Keep a handle on the shared service container and a dedicated
        # document processor instance.
        self.service = serviceContainer
        self._processor = DocumentProcessor()

    async def extractFromChatDocument(self, prompt: str, document: ChatDocument) -> ExtractedContent:
        """
        Extract content from a ChatDocument with AI processing.

        Args:
            prompt: Prompt for AI content extraction
            document: The ChatDocument to process

        Returns:
            ExtractedContent containing the processed content
        """
        # The processor works on TaskDocuments, so convert first.
        converted = await self._convertToTaskDocument(document)
        result = await self._processor.processDocument(converted, prompt)

        # Re-point the result at the originating ChatDocument instead of
        # the intermediate TaskDocument.
        result.objectId = document.id
        result.objectType = "ChatDocument"
        return result

    async def extractFromTaskDocument(self, prompt: str, document: TaskDocument) -> ExtractedContent:
        """
        Extract content directly from a task document.

        Args:
            prompt: The prompt to use for content extraction
            document: The task document to extract content from

        Returns:
            ExtractedContent containing the processed content

        Raises:
            ValueError: If document is invalid
            IOError: If file cannot be read
        """
        try:
            return await self._processor.processDocument(document, prompt)
        except Exception as e:
            logger.error(f"Error extracting from task document: {str(e)}")
            raise

    async def _convertToTaskDocument(self, chatDoc: ChatDocument) -> TaskDocument:
        """
        Convert a ChatDocument to a TaskDocument.

        Args:
            chatDoc: The chat document to convert

        Returns:
            TaskDocument containing the converted data

        Raises:
            ValueError: If document is invalid
            IOError: If file cannot be read
        """
        try:
            # Fetch the raw bytes backing the chat document.
            fileContent = await self.service.functions.getFileData(chatDoc.fileId)
            if not fileContent:
                raise ValueError(f"Could not get content for file {chatDoc.fileId}")

            # TaskDocument carries its payload as base64 text.
            encoded = base64.b64encode(fileContent).decode('utf-8')

            return TaskDocument(
                id=str(uuid.uuid4()),
                filename=chatDoc.filename,
                fileSize=chatDoc.fileSize,
                mimeType=chatDoc.mimeType,
                data=encoded
            )
        except Exception as e:
            logger.error(f"Error converting chat document to task document: {str(e)}")
            raise
|
||||
|
|
@ -6,6 +6,8 @@ import io
|
|||
import base64
|
||||
from datetime import datetime, UTC
|
||||
from pathlib import Path
|
||||
import xml.etree.ElementTree as ET
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from modules.interfaces.serviceChatModel import (
|
||||
ChatDocument,
|
||||
|
|
@ -35,8 +37,11 @@ class DocumentProcessor:
|
|||
|
||||
def __init__(self, currentUser: Optional[User] = None):
|
||||
"""Initialize the document processor."""
|
||||
|
||||
self.serviceManagement = getInterface(currentUser)
|
||||
|
||||
self._neutralizer = DataAnonymizer() if APP_CONFIG.get("ENABLE_CONTENT_NEUTRALIZATION", False) else None
|
||||
|
||||
self.supportedTypes: Dict[str, Callable[[Union[ChatDocument, TaskDocument]], Awaitable[List[ContentItem]]]] = {
|
||||
'text/plain': self._processText,
|
||||
'text/csv': self._processCsv,
|
||||
|
|
@ -51,6 +56,22 @@ class DocumentProcessor:
|
|||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._processDocx,
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': self._processXlsx
|
||||
}
|
||||
|
||||
self.chunkSizes = {
|
||||
"text": 40000, # General text content
|
||||
"plain": 40000, # Plain text
|
||||
"csv": 40000, # CSV data
|
||||
"json": 40000, # JSON data
|
||||
"xml": 40000, # XML data
|
||||
"html": 40000, # HTML content
|
||||
"image": 1024 * 1024, # 1MB for images
|
||||
"video": 5 * 1024 * 1024, # 5MB for video chunks
|
||||
"binary": 1024 * 1024, # 1MB for binary data
|
||||
"pdf": 40000, # PDF text content
|
||||
"docx": 40000, # Word document text
|
||||
"xlsx": 40000, # Excel data
|
||||
"svg": 40000 # SVG content
|
||||
}
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Initialize the document processor."""
|
||||
|
|
@ -127,37 +148,7 @@ class DocumentProcessor:
|
|||
if prompt and contentItems:
|
||||
try:
|
||||
# Process each content item with AI
|
||||
processedItems = []
|
||||
for item in contentItems:
|
||||
# Neutralize content if neutralizer is enabled
|
||||
contentToProcess = item.data
|
||||
if self._neutralizer and contentToProcess:
|
||||
contentToProcess = self._neutralizer.neutralize(contentToProcess)
|
||||
|
||||
# Create AI prompt for this content
|
||||
aiPrompt = f"""
|
||||
Extract relevant information from this content based on the following prompt:
|
||||
|
||||
PROMPT: {prompt}
|
||||
|
||||
CONTENT:
|
||||
{contentToProcess}
|
||||
|
||||
Return ONLY the extracted information in a clear, concise format.
|
||||
"""
|
||||
|
||||
# Get AI response
|
||||
response = await self.serviceManagement.callAi([
|
||||
{"role": "system", "content": "You are an expert at extracting relevant information from documents."},
|
||||
{"role": "user", "content": aiPrompt}
|
||||
])
|
||||
|
||||
# Update content with AI processed data
|
||||
processedItems.append(ContentItem(
|
||||
label=item.label,
|
||||
data=response.strip(),
|
||||
metadata=item.metadata
|
||||
))
|
||||
processedItems = await self._aiDataExtraction(contentItems, prompt)
|
||||
|
||||
contentItems = processedItems
|
||||
|
||||
|
|
@ -235,7 +226,9 @@ class DocumentProcessor:
|
|||
data=content,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType="text/plain",
|
||||
base64Encoded=False
|
||||
)
|
||||
)]
|
||||
except Exception as e:
|
||||
|
|
@ -258,7 +251,9 @@ class DocumentProcessor:
|
|||
data=content,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType="text/csv",
|
||||
base64Encoded=False
|
||||
)
|
||||
)]
|
||||
except Exception as e:
|
||||
|
|
@ -284,7 +279,9 @@ class DocumentProcessor:
|
|||
data=content,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType="application/json",
|
||||
base64Encoded=False
|
||||
)
|
||||
)]
|
||||
except Exception as e:
|
||||
|
|
@ -307,7 +304,9 @@ class DocumentProcessor:
|
|||
data=content,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType="application/xml",
|
||||
base64Encoded=False
|
||||
)
|
||||
)]
|
||||
except Exception as e:
|
||||
|
|
@ -330,7 +329,9 @@ class DocumentProcessor:
|
|||
data=content,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType="text/html",
|
||||
base64Encoded=False
|
||||
)
|
||||
)]
|
||||
except Exception as e:
|
||||
|
|
@ -356,6 +357,8 @@ class DocumentProcessor:
|
|||
data=content if isSvg else None,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
mimeType="image/svg+xml",
|
||||
base64Encoded=False,
|
||||
error=None if isSvg else "Invalid SVG content"
|
||||
)
|
||||
)]
|
||||
|
|
@ -383,7 +386,9 @@ class DocumentProcessor:
|
|||
size=len(fileData),
|
||||
width=img.width,
|
||||
height=img.height,
|
||||
colorMode=img.mode
|
||||
colorMode=img.mode,
|
||||
mimeType=document.mimeType,
|
||||
base64Encoded=True
|
||||
)
|
||||
|
||||
# Convert image to base64 for storage
|
||||
|
|
@ -420,7 +425,9 @@ class DocumentProcessor:
|
|||
pdfReader = PyPDF2.PdfReader(pdfStream)
|
||||
metadata = ContentMetadata(
|
||||
size=len(fileData),
|
||||
pages=len(pdfReader.pages)
|
||||
pages=len(pdfReader.pages),
|
||||
mimeType="application/pdf",
|
||||
base64Encoded=False
|
||||
)
|
||||
|
||||
# Extract text from all pages
|
||||
|
|
@ -433,7 +440,9 @@ class DocumentProcessor:
|
|||
data=pageText,
|
||||
metadata=ContentMetadata(
|
||||
size=len(pageText.encode('utf-8')),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType="text/plain",
|
||||
base64Encoded=False
|
||||
)
|
||||
))
|
||||
|
||||
|
|
@ -456,7 +465,9 @@ class DocumentProcessor:
|
|||
data=base64.b64encode(imageBytes).decode('utf-8'),
|
||||
metadata=ContentMetadata(
|
||||
size=len(imageBytes),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType=f"image/{imageExt}",
|
||||
base64Encoded=True
|
||||
)
|
||||
))
|
||||
except Exception as imgE:
|
||||
|
|
@ -506,7 +517,9 @@ class DocumentProcessor:
|
|||
data=content,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
pages=len(doc.paragraphs)
|
||||
pages=len(doc.paragraphs),
|
||||
mimeType="text/plain",
|
||||
base64Encoded=False
|
||||
)
|
||||
)]
|
||||
except Exception as e:
|
||||
|
|
@ -551,7 +564,9 @@ class DocumentProcessor:
|
|||
data=content,
|
||||
metadata=ContentMetadata(
|
||||
size=len(content.encode('utf-8')),
|
||||
pages=1
|
||||
pages=1,
|
||||
mimeType="text/csv",
|
||||
base64Encoded=False
|
||||
)
|
||||
))
|
||||
|
||||
|
|
@ -575,6 +590,8 @@ class DocumentProcessor:
|
|||
data=base64.b64encode(fileData).decode('utf-8'),
|
||||
metadata=ContentMetadata(
|
||||
size=len(fileData),
|
||||
mimeType=document.mimeType,
|
||||
base64Encoded=True,
|
||||
error="Unsupported file type"
|
||||
)
|
||||
)]
|
||||
|
|
@ -582,6 +599,291 @@ class DocumentProcessor:
|
|||
logger.error(f"Error processing binary document: {str(e)}")
|
||||
raise FileProcessingError(f"Failed to process binary document: {str(e)}")
|
||||
|
||||
async def _aiDataExtraction(self, contentItems: List[ContentItem], prompt: str) -> List[ContentItem]:
    """
    Process content items with AI, handling chunking based on content type.

    Args:
        contentItems: List of content items to process
        prompt: Prompt for AI content extraction

    Returns:
        List of processed content items.  Items whose processing fails
        are passed through unchanged rather than dropped.
    """
    processedItems = []

    for item in contentItems:
        try:
            # Get content type from metadata; default to plain text.
            mimeType = item.metadata.mimeType if hasattr(item.metadata, 'mimeType') else "text/plain"

            # Chunk content based on type.
            # BUGFIX: the _chunk* helpers are plain synchronous methods
            # returning lists -- awaiting them raised TypeError, which sent
            # every text/image/video item into the except branch below and
            # skipped AI processing entirely.
            if mimeType.startswith('text/'):
                chunks = self._chunkText(item.data, mimeType)
            elif mimeType.startswith('image/'):
                chunks = self._chunkImage(item.data)
            elif mimeType.startswith('video/'):
                chunks = self._chunkVideo(item.data)
            else:
                # Binary data - no chunking
                chunks = [item.data]

            # Process each chunk
            chunkResults = []
            for chunk in chunks:
                # Neutralize content if neutralizer is enabled
                contentToProcess = chunk
                if self._neutralizer and contentToProcess:
                    contentToProcess = self._neutralizer.neutralize(contentToProcess)

                # Create AI prompt for this chunk
                aiPrompt = f"""
Extract relevant information from this content based on the following prompt:

PROMPT: {prompt}

CONTENT:
{contentToProcess}

Return ONLY the extracted information in a clear, concise format.
"""

                # Get AI response
                response = await self.serviceManagement.callAi([
                    {"role": "system", "content": "You are an expert at extracting relevant information from documents."},
                    {"role": "user", "content": aiPrompt}
                ])

                chunkResults.append(response.strip())

            # Combine chunk results
            combinedResult = "\n".join(chunkResults)

            # Update content with AI processed data
            processedItems.append(ContentItem(
                label=item.label,
                data=combinedResult,
                metadata=ContentMetadata(
                    size=len(combinedResult.encode('utf-8')),
                    pages=1,
                    mimeType="text/plain",
                    base64Encoded=False
                )
            ))

        except Exception as e:
            logger.error(f"Error processing content chunk: {str(e)}")
            # Add original content if processing fails
            processedItems.append(item)

    return processedItems
|
||||
|
||||
def _chunkText(self, content: str, mimeType: str) -> List[str]:
    """Chunk text content based on mime type"""
    # Map each known textual mime type to its specialized chunker;
    # anything unrecognized falls back to plain-text chunking.
    handlers = {
        "text/plain": self._chunkPlainText,
        "text/csv": self._chunkCsv,
        "application/json": self._chunkJson,
        "application/xml": self._chunkXml,
        "text/html": self._chunkHtml,
    }
    chunker = handlers.get(mimeType, self._chunkPlainText)
    return chunker(content)
|
||||
|
||||
def _chunkPlainText(self, content: str) -> List[str]:
    """Chunk plain text content"""
    limit = self.chunkSizes["plain"]
    chunks: List[str] = []
    buffer: List[str] = []
    used = 0

    # Accumulate whole lines until adding the next one would exceed the
    # byte budget, then flush the buffer as a chunk.
    for line in content.split('\n'):
        cost = len(line.encode('utf-8'))
        if used + cost > limit:
            if buffer:
                chunks.append('\n'.join(buffer))
            buffer = [line]
            used = cost
        else:
            buffer.append(line)
            used += cost

    # Flush any trailing partial chunk.
    if buffer:
        chunks.append('\n'.join(buffer))

    return chunks
|
||||
|
||||
def _chunkCsv(self, content: str) -> List[str]:
    """Chunk CSV content"""
    limit = self.chunkSizes["csv"]
    chunks: List[str] = []
    buffer: List[str] = []
    used = 0

    # Rows are kept intact: a row that would overflow the current chunk
    # starts a new one instead of being split.
    for row in content.split('\n'):
        cost = len(row.encode('utf-8'))
        if used + cost > limit:
            if buffer:
                chunks.append('\n'.join(buffer))
            buffer = [row]
            used = cost
        else:
            buffer.append(row)
            used += cost

    # Flush any trailing partial chunk.
    if buffer:
        chunks.append('\n'.join(buffer))

    return chunks
|
||||
|
||||
def _chunkJson(self, content: str) -> List[str]:
    """Chunk JSON content"""
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # Not valid JSON: return the raw text as a single chunk.
        return [content]

    chunks: List[str] = []
    bucket: list = []
    bucketSize = 0

    def addValue(value, path=""):
        # Size is estimated from the serialized form, keyed by path
        # when one is given.
        nonlocal bucket, bucketSize
        serialized = json.dumps({path: value}) if path else json.dumps(value)
        cost = len(serialized.encode('utf-8'))

        if bucketSize + cost > self.chunkSizes["json"]:
            if bucket:
                chunks.append(json.dumps(bucket))
            bucket = [value]
            bucketSize = cost
        else:
            bucket.append(value)
            bucketSize += cost

    # Walk one level of the document: list elements by index, dict
    # entries by key, and scalars as a single value.
    if isinstance(parsed, list):
        for index, element in enumerate(parsed):
            addValue(element, str(index))
    elif isinstance(parsed, dict):
        for key, value in parsed.items():
            addValue(value, key)
    else:
        addValue(parsed)

    # Flush any trailing partial bucket.
    if bucket:
        chunks.append(json.dumps(bucket))

    return chunks
|
||||
|
||||
def _chunkXml(self, content: str) -> List[str]:
    """Chunk XML content"""
    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        # Unparseable XML: return the raw text as a single chunk.
        return [content]

    chunks: List[str] = []
    bucket: List[str] = []
    bucketSize = 0

    def addElement(element):
        nonlocal bucket, bucketSize
        serialized = ET.tostring(element, encoding='unicode')
        cost = len(serialized.encode('utf-8'))

        if bucketSize + cost > self.chunkSizes["xml"]:
            if bucket:
                chunks.append(''.join(bucket))
            bucket = [serialized]
            bucketSize = cost
        else:
            bucket.append(serialized)
            bucketSize += cost

    # Only direct children of the root are chunked; nested elements stay
    # inside their parent's serialization.
    for child in root:
        addElement(child)

    # Flush any trailing partial bucket.
    if bucket:
        chunks.append(''.join(bucket))

    return chunks
|
||||
|
||||
def _chunkHtml(self, content: str) -> List[str]:
    """Chunk HTML content"""
    try:
        soup = BeautifulSoup(content, 'html.parser')
        chunks: List[str] = []
        bucket: List[str] = []
        bucketSize = 0

        def addElement(element):
            nonlocal bucket, bucketSize
            serialized = str(element)
            cost = len(serialized.encode('utf-8'))

            if bucketSize + cost > self.chunkSizes["html"]:
                if bucket:
                    chunks.append(''.join(bucket))
                bucket = [serialized]
                bucketSize = cost
            else:
                bucket.append(serialized)
                bucketSize += cost

        # Block-level containers drive the chunk boundaries.
        for element in soup.find_all(['p', 'div', 'section', 'article']):
            addElement(element)

        # Flush any trailing partial bucket.
        if bucket:
            chunks.append(''.join(bucket))

        return chunks
    except Exception:
        # Any parse/processing failure: return the raw text unsplit.
        return [content]
|
||||
|
||||
def _chunkImage(self, content: str) -> List[str]:
    """Chunk image content"""
    try:
        raw = base64.b64decode(content)
        step = self.chunkSizes["image"]
        # Each fixed-size byte slice is re-encoded independently.
        return [
            base64.b64encode(raw[offset:offset + step]).decode('utf-8')
            for offset in range(0, len(raw), step)
        ]
    except Exception:
        # Undecodable input: hand back the original string as one chunk.
        return [content]
|
||||
|
||||
def _chunkVideo(self, content: str) -> List[str]:
    """Chunk video content"""
    try:
        raw = base64.b64decode(content)
        step = self.chunkSizes["video"]
        # Each fixed-size byte slice is re-encoded independently.
        return [
            base64.b64encode(raw[offset:offset + step]).decode('utf-8')
            for offset in range(0, len(raw), step)
        ]
    except Exception:
        # Undecodable input: hand back the original string as one chunk.
        return [content]
|
||||
|
||||
def _chunkBinary(self, content: str) -> List[str]:
    """Chunk binary content"""
    try:
        raw = base64.b64decode(content)
        step = self.chunkSizes["binary"]
        # Each fixed-size byte slice is re-encoded independently.
        return [
            base64.b64encode(raw[offset:offset + step]).decode('utf-8')
            for offset in range(0, len(raw), step)
        ]
    except Exception:
        # Undecodable input: hand back the original string as one chunk.
        return [content]
|
||||
|
||||
async def _extractText(self, content: bytes, mimeType: str) -> str:
|
||||
"""Extract text content from various text formats"""
|
||||
try:
|
||||
|
|
|
|||
Loading…
Reference in a new issue