from typing import Dict, Any, List, Optional, Union, Tuple, TypedDict, Callable
import logging
import json
import os
import io
import base64
from datetime import datetime, UTC
from pathlib import Path
import mimetypes
import hashlib
import shutil
import re
import uuid

from modules.interfaces.serviceChatModel import (
    DocumentContext,
    DocumentExtraction,
    DocumentMetadata,
    DocumentContent,
    ProcessedDocument,
    ImageData
)

logger = logging.getLogger(__name__)

# Optional imports - only loaded when needed
pdfExtractorLoaded = False
officeExtractorLoaded = False
imageProcessorLoaded = False


class FileProcessingError(Exception):
    """Custom exception for file processing errors."""
    pass


class DocumentProcessor:
    """Processes documents with context awareness"""

    def __init__(self):
        self.supported_types = {
            "text/plain": self._process_text,
            "text/csv": self._process_csv,
            "application/json": self._process_json,
            "text/html": self._process_html,
            "image/svg+xml": self._process_svg,
            "application/pdf": self._process_pdf,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": self._process_docx,
            "application/msword": self._process_docx,
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": self._process_xlsx,
            "application/vnd.ms-excel": self._process_xlsx,
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": self._process_pptx,
            "application/vnd.ms-powerpoint": self._process_pptx
        }

        # Add image types
        for img_type in ["image/jpeg", "image/png", "image/gif", "image/bmp", "image/tiff"]:
            self.supported_types[img_type] = self._process_image

    def _load_pdf_extractor(self):
        """Loads PDF extraction libraries when needed"""
        global pdfExtractorLoaded
        if not pdfExtractorLoaded:
            try:
                global PyPDF2, fitz
                import PyPDF2
                import fitz  # PyMuPDF for more extensive PDF processing
                pdfExtractorLoaded = True
                logger.info("PDF extraction libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"PDF extraction libraries could not be loaded: {e}")

    def _load_office_extractor(self):
        """Loads Office document extraction libraries when needed"""
        global officeExtractorLoaded
        if not officeExtractorLoaded:
            try:
                global docx, openpyxl
                import docx  # python-docx for Word documents
                import openpyxl  # for Excel files
                officeExtractorLoaded = True
                logger.info("Office extraction libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"Office extraction libraries could not be loaded: {e}")

    def _load_image_processor(self):
        """Loads image processing libraries when needed"""
        global imageProcessorLoaded
        if not imageProcessorLoaded:
            try:
                global Image
                from PIL import Image  # Pillow
                imageProcessorLoaded = True
                logger.info("Image processing libraries successfully loaded")
            except ImportError as e:
                logger.warning(f"Image processing libraries could not be loaded: {e}")
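
    # The three loaders above pull in optional third-party dependencies on
    # first use; when an import fails, the matching processors fall back to
    # metadata-only output. The import names typically correspond to these
    # distributions (assumed packaging, verify against your own environment):
    #   PyPDF2    -> pip install PyPDF2
    #   fitz      -> pip install PyMuPDF
    #   docx      -> pip install python-docx
    #   openpyxl  -> pip install openpyxl
    #   PIL.Image -> pip install Pillow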

    def process_with_context(self, doc: Dict[str, Any], context: DocumentContext) -> ProcessedDocument:
        """Process document with context"""
        try:
            # Get content type
            content_type = doc.get("contentType", "text/plain")
            if content_type == "application/octet-stream":
                # Try to detect actual file type
                content_type = self._detect_content_type(doc)

            if content_type not in self.supported_types:
                # Fallback to binary processing
                return self._process_binary(doc, context)

            # Process document
            processor = self.supported_types[content_type]
            extracted = processor(doc, context)

            # Track extraction
            self._track_extraction(doc, extracted, context)

            # Create ProcessedDocument
            return ProcessedDocument(
                id=doc.get("id", str(uuid.uuid4())),
                name=doc.get("name", "Unknown"),
                contentType=content_type,
                content=extracted,
                context=context
            )

        except Exception as e:
            logger.error(f"Error processing document: {str(e)}")
            # Fallback to binary processing
            return self._process_binary(doc, context)
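
    # Expected input shape, inferred from the .get() calls above (a sketch,
    # not an authoritative schema):
    #   doc = {
    #       "id": "<optional id>",
    #       "name": "report.pdf",
    #       "contentType": "application/pdf",
    #       "content": b"...",  # str, bytes, or parsed JSON depending on type
    #   }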

    def _detect_content_type(self, doc: Dict[str, Any]) -> str:
        """Detect content type from the file name or content"""
        try:
            # Check file extension first
            file_name = doc.get("name", "")
            ext = os.path.splitext(file_name)[1].lower()
            if ext:
                # Map common extensions to MIME types
                ext_to_mime = {
                    '.txt': 'text/plain',
                    '.md': 'text/markdown',
                    '.csv': 'text/csv',
                    '.json': 'application/json',
                    '.xml': 'application/xml',
                    '.js': 'application/javascript',
                    '.py': 'application/x-python',
                    '.svg': 'image/svg+xml',
                    '.jpg': 'image/jpeg',
                    '.jpeg': 'image/jpeg',
                    '.png': 'image/png',
                    '.gif': 'image/gif',
                    '.pdf': 'application/pdf',
                    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                    '.doc': 'application/msword',
                    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                    '.xls': 'application/vnd.ms-excel',
                    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                    '.ppt': 'application/vnd.ms-powerpoint'
                }
                if ext in ext_to_mime:
                    return ext_to_mime[ext]

            # Try to detect whether the payload is text
            content = doc.get("content", "")
            if isinstance(content, str):
                return 'text/plain'
            if isinstance(content, bytes):
                try:
                    content.decode('utf-8')
                    return 'text/plain'
                except UnicodeDecodeError:
                    pass

            return 'application/octet-stream'

        except Exception as e:
            logger.error(f"Error detecting content type: {str(e)}")
            return 'application/octet-stream'
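
    # Illustrative behaviour: a doc named "report.pdf" uploaded as
    # "application/octet-stream" is remapped to "application/pdf" via the
    # extension table; unnamed, undecodable bytes stay "application/octet-stream".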

    def _process_text(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process text document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            try:
                content = content.decode('utf-8')
            except UnicodeDecodeError:
                # Try alternative encodings
                for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                    try:
                        content = content.decode(encoding)
                        break
                    except UnicodeDecodeError:
                        continue

        sections = self._extract_sections(content)
        return DocumentContent(
            text=content,
            metadata=DocumentMetadata(
                type="text",
                format="text",
                size=len(content.encode('utf-8')),
                sections=sections
            )
        )

    def _process_csv(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process CSV document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            content = content.decode('utf-8')

        return DocumentContent(
            text=content,
            metadata=DocumentMetadata(
                type="csv",
                format="csv",
                size=len(content.encode('utf-8')),
                sections=[f"Row {i+1}" for i in range(len(content.splitlines()))]
            )
        )

    def _process_json(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process JSON document"""
        content = doc.get("content", {})
        if isinstance(content, str):
            content = json.loads(content)
        elif isinstance(content, bytes):
            content = json.loads(content.decode('utf-8'))

        structure = self._analyze_structure(content)
        return DocumentContent(
            data=content,
            metadata=DocumentMetadata(
                type="json",
                format="json",
                size=len(json.dumps(content).encode('utf-8')),
                sections=list(content.keys()) if isinstance(content, dict) else []
            )
        )

    def _process_html(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process HTML document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            content = content.decode('utf-8')

        return DocumentContent(
            text=content,
            metadata=DocumentMetadata(
                type="html",
                format="html",
                size=len(content.encode('utf-8')),
                sections=[
                    self._extract_title(content) or "Untitled",
                    *self._extract_links(content),
                    *self._extract_images(content)
                ]
            )
        )

    def _process_svg(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process SVG document"""
        content = doc.get("content", "")
        if isinstance(content, bytes):
            content = content.decode('utf-8')

        # Check if it's actually SVG
        is_svg = "<svg" in content.lower()

        return DocumentContent(
            text=content if is_svg else None,
            metadata=DocumentMetadata(
                type="svg",
                format="svg",
                size=len(content.encode('utf-8')),
                error=None if is_svg else "Invalid SVG content"
            )
        )

    def _process_image(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process image document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            try:
                content = content.encode('utf-8')
            except Exception as e:
                logger.error(f"Error encoding image content: {str(e)}")
                return DocumentContent(
                    metadata=DocumentMetadata(
                        type="image",
                        format="unknown",
                        size=0,
                        error=f"Invalid image content: {str(e)}"
                    )
                )

        metadata = DocumentMetadata(
            type="image",
            format=doc.get("contentType", "").split("/")[-1],
            size=len(content)
        )

        try:
            self._load_image_processor()
            if imageProcessorLoaded:
                with io.BytesIO(content) as img_stream:
                    img = Image.open(img_stream)
                    img.verify()
                    # verify() leaves the image unusable, so reopen it
                    img_stream.seek(0)
                    img = Image.open(img_stream)
                    metadata.pages = 1
                    if hasattr(img, '_getexif') and callable(img._getexif):
                        exif = img._getexif()
                        if exif:
                            metadata.sections = [f"EXIF_{tag_id}" for tag_id in exif.keys()]
        except Exception as e:
            logger.warning(f"Error processing image: {str(e)}")
            metadata.error = str(e)

        try:
            image_data = ImageData(
                data=base64.b64encode(content).decode('utf-8'),
                format=metadata.format,
                page=None,
                index=None
            )
        except Exception as e:
            logger.error(f"Error creating image data: {str(e)}")
            return DocumentContent(
                metadata=DocumentMetadata(
                    type="image",
                    format=metadata.format,
                    size=len(content),
                    error=f"Error creating image data: {str(e)}"
                )
            )

        return DocumentContent(
            images=[image_data],
            metadata=metadata
        )

    def _process_pdf(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process PDF document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        metadata = DocumentMetadata(
            type="pdf",
            format="pdf",
            size=len(content)
        )

        text_content = ""
        images: List[ImageData] = []

        try:
            self._load_pdf_extractor()
            if pdfExtractorLoaded:
                with io.BytesIO(content) as pdf_stream:
                    # Extract text with PyPDF2
                    pdf_reader = PyPDF2.PdfReader(pdf_stream)
                    metadata.pages = len(pdf_reader.pages)

                    # Extract text from all pages
                    for page_num in range(len(pdf_reader.pages)):
                        page = pdf_reader.pages[page_num]
                        page_text = page.extract_text()
                        if page_text:
                            text_content += f"--- Page {page_num + 1} ---\n{page_text}\n\n"

                    # Extract images with PyMuPDF (avoid shadowing the `doc` argument)
                    pdf_stream.seek(0)
                    pdf_doc = fitz.open(stream=pdf_stream, filetype="pdf")
                    for page_num in range(len(pdf_doc)):
                        page = pdf_doc[page_num]
                        for img_index, img_info in enumerate(page.get_images(full=True)):
                            try:
                                xref = img_info[0]
                                base_image = pdf_doc.extract_image(xref)
                                if base_image:
                                    image_bytes = base_image.get("image", b"")
                                    image_ext = base_image.get("ext", "png")

                                    if image_bytes:
                                        image_data = ImageData(
                                            data=base64.b64encode(image_bytes).decode('utf-8'),
                                            format=image_ext,
                                            page=page_num + 1,
                                            index=img_index
                                        )
                                        images.append(image_data)
                            except Exception as img_e:
                                logger.warning(f"Error extracting image {img_index} on page {page_num + 1}: {str(img_e)}")

                    pdf_doc.close()

        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            metadata.error = str(e)

        return DocumentContent(
            text=text_content,
            images=images,
            metadata=metadata
        )

    def _process_docx(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process Word document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        metadata = DocumentMetadata(
            type="docx",
            format="docx",
            size=len(content)
        )

        text_content = ""

        try:
            self._load_office_extractor()
            if officeExtractorLoaded:
                with io.BytesIO(content) as docx_stream:
                    # Avoid shadowing the `doc` argument
                    word_doc = docx.Document(docx_stream)

                    # Extract text
                    full_text = []
                    for para in word_doc.paragraphs:
                        full_text.append(para.text)

                    # Extract tables
                    for table in word_doc.tables:
                        for row in table.rows:
                            row_text = []
                            for cell in row.cells:
                                row_text.append(cell.text)
                            full_text.append(" | ".join(row_text))

                    text_content = "\n\n".join(full_text)
                    # python-docx exposes no page count; use the paragraph
                    # count as an approximation
                    metadata.pages = len(word_doc.paragraphs)
                    metadata.sections = [f"Paragraph {i+1}" for i in range(len(word_doc.paragraphs))]

        except Exception as e:
            logger.error(f"Error processing Word document: {str(e)}")
            metadata.error = str(e)

        return DocumentContent(
            text=text_content,
            metadata=metadata
        )

    def _process_xlsx(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process Excel document"""
        content = doc.get("content", b"")
        if not isinstance(content, bytes):
            content = content.encode('utf-8')

        metadata = DocumentMetadata(
            type="xlsx",
            format="xlsx",
            size=len(content)
        )

        sheets_data = []

        try:
            self._load_office_extractor()
            if officeExtractorLoaded:
                with io.BytesIO(content) as xlsx_stream:
                    workbook = openpyxl.load_workbook(xlsx_stream, data_only=True)
                    metadata.pages = len(workbook.sheetnames)

                    for sheet_name in workbook.sheetnames:
                        sheet = workbook[sheet_name]
                        csv_rows = []
                        for row in sheet.iter_rows():
                            csv_row = []
                            for cell in row:
                                value = cell.value
                                if value is None:
                                    csv_row.append("")
                                else:
                                    csv_row.append(str(value).replace('"', '""'))
                            csv_rows.append(','.join(f'"{cell}"' for cell in csv_row))

                        sheets_data.append({
                            "name": sheet_name,
                            "data": "\n".join(csv_rows)
                        })

                    metadata.sections = workbook.sheetnames

        except Exception as e:
            logger.error(f"Error processing Excel document: {str(e)}")
            metadata.error = str(e)

        return DocumentContent(
            data={"sheets": sheets_data},
            metadata=metadata
        )
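
    # Illustrative shape of the resulting payload (one CSV-style blob per sheet):
    #   {"sheets": [{"name": "Sheet1", "data": '"Name","Qty"\n"Bolt","4"'}]}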

    def _process_pptx(self, doc: Dict[str, Any], context: DocumentContext) -> DocumentContent:
        """Process PowerPoint document"""
        return DocumentContent(
            metadata=DocumentMetadata(
                type="pptx",
                format="pptx",
                size=len(doc.get("content", b"")),
                error="PowerPoint processing not implemented"
            )
        )

    def _process_binary(self, doc: Dict[str, Any], context: DocumentContext) -> ProcessedDocument:
        """Process binary document"""
        content = doc.get("content", b"")
        if isinstance(content, str):
            content = content.encode('utf-8')
        elif not isinstance(content, bytes):
            # Last-resort fallback for structured content
            content = json.dumps(content, default=str).encode('utf-8')

        return ProcessedDocument(
            id=doc.get("id", str(uuid.uuid4())),
            name=doc.get("name", "Unknown"),
            contentType="application/octet-stream",
            content=DocumentContent(
                data={"binary": base64.b64encode(content).decode('utf-8')},
                metadata=DocumentMetadata(
                    type="binary",
                    format="binary",
                    size=len(content)
                )
            ),
            context=context
        )

    def _extract_sections(self, content: str) -> List[str]:
        """Extract sections from text content"""
        sections = []
        current_section = []

        for line in content.split("\n"):
            if line.startswith("#"):
                if current_section:
                    sections.append("\n".join(current_section))
                    current_section = []
            current_section.append(line)

        if current_section:
            sections.append("\n".join(current_section))

        return sections
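
    # Illustrative example: "# Intro\nhello\n# Usage\nworld" yields two
    # sections, "# Intro\nhello" and "# Usage\nworld". Any text before the
    # first heading becomes its own leading section.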

    def _analyze_structure(self, content: Any) -> Dict[str, Any]:
        """Analyze structure of JSON content"""
        if isinstance(content, dict):
            return {
                "type": "object",
                "properties": {
                    k: self._analyze_structure(v)
                    for k, v in content.items()
                }
            }
        elif isinstance(content, list):
            return {
                "type": "array",
                "items": self._analyze_structure(content[0]) if content else {}
            }
        else:
            return {
                "type": type(content).__name__
            }
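
    # Illustrative example:
    #   _analyze_structure({"a": 1, "tags": ["x"]})
    #   -> {"type": "object",
    #       "properties": {"a": {"type": "int"},
    #                      "tags": {"type": "array", "items": {"type": "str"}}}}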

    def _extract_title(self, html: str) -> Optional[str]:
        """Extract title from HTML"""
        match = re.search(r"<title>(.*?)</title>", html, re.IGNORECASE)
        return match.group(1) if match else None

    def _extract_links(self, html: str) -> List[str]:
        """Extract links from HTML"""
        return re.findall(r'href=[\'"]?([^\'" >]+)', html)

    def _extract_images(self, html: str) -> List[str]:
        """Extract images from HTML"""
        return re.findall(r'src=[\'"]?([^\'" >]+)', html)
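
    # Illustrative examples:
    #   _extract_title("<title>Report</title>")                -> "Report"
    #   _extract_links('<a href="https://example.com">go</a>') -> ["https://example.com"]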

    def _track_extraction(self, doc: Dict[str, Any], extracted: DocumentContent, context: DocumentContext) -> None:
        """Track document extraction"""
        extraction = DocumentExtraction(
            timestamp=datetime.now(UTC).isoformat(),
            type=doc.get("contentType", "unknown"),
            sections=extracted.metadata.sections or [],
            metadata=extracted.metadata.dict()
        )
        context.extractionHistory.append(extraction)

    def get_supported_types(self) -> List[str]:
        """Get list of supported content types"""
        return list(self.supported_types.keys())

    def add_processor(self, content_type: str, processor: Callable[..., DocumentContent]) -> None:
        """Add new document processor"""
        self.supported_types[content_type] = processor

    def remove_processor(self, content_type: str) -> None:
        """Remove document processor"""
        self.supported_types.pop(content_type, None)
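

# Minimal usage sketch (illustrative only). It assumes DocumentContext can be
# constructed with an empty extractionHistory list; the real constructor lives
# in modules.interfaces.serviceChatModel and may require other fields.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    processor = DocumentProcessor()
    print("Supported types:", processor.get_supported_types())

    # Hypothetical context object; adjust to the actual model definition.
    context = DocumentContext(extractionHistory=[])

    sample_doc = {
        "id": "demo-1",
        "name": "notes.txt",
        "contentType": "text/plain",
        "content": b"# Heading\nSome body text\n",
    }
    result = processor.process_with_context(sample_doc, context)
    print(result.contentType, result.content.metadata.size)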