gateway/modules/documentProcessor.py
2025-04-26 02:13:22 +02:00

887 lines
No EOL
32 KiB
Python

"""
Module for extracting content from various file formats.
Provides specialized functions for processing text, PDF, Office documents, images, etc.
"""
import logging
import os
import io
from typing import Dict, Any, List, Optional, Union, Tuple
import base64
# Configure logger
logger = logging.getLogger(__name__)
# Optional imports - only loaded when needed
pdfExtractorLoaded = False
officeExtractorLoaded = False
imageProcessorLoaded = False
def getDocumentContents(fileMetadata: Dict[str, Any], fileContent: bytes) -> List[Dict[str, Any]]:
"""
Main function for extracting content from a file based on its MIME type.
Delegates to specialized extraction functions.
Args:
fileMetadata: File metadata (Name, MIME type, etc.)
fileContent: Binary data of the file
Returns:
List of Document-Content objects with metadata and isText flag
"""
try:
mimeType = fileMetadata.get("mimeType", "application/octet-stream")
fileName = fileMetadata.get("name", "unknown")
logger.info(f"Extracting content from file '{fileName}' (MIME type: {mimeType})")
# Extract content based on MIME type
contents = []
# Text-based formats
if mimeType.startswith("text/") or mimeType in [
"application/json",
"application/xml",
"application/javascript",
"application/x-python"
]:
contents.extend(extractTextContent(fileName, fileContent, mimeType))
# CSV Format
elif mimeType == "text/csv":
contents.extend(extractCsvContent(fileName, fileContent))
# SVG Files
elif mimeType == "image/svg+xml":
contents.extend(extractSvgContent(fileName, fileContent))
# Images
elif mimeType.startswith("image/"):
contents.extend(extractImageContent(fileName, fileContent, mimeType))
# PDF Documents
elif mimeType == "application/pdf":
contents.extend(extractPdfContent(fileName, fileContent))
# Word Documents
elif mimeType in [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/msword"
]:
contents.extend(extractWordContent(fileName, fileContent, mimeType))
# Excel Documents
elif mimeType in [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-excel"
]:
contents.extend(extractExcelContent(fileName, fileContent, mimeType))
# PowerPoint Documents
elif mimeType in [
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint"
]:
contents.extend(extractPowerpointContent(fileName, fileContent, mimeType))
# Binary data as fallback for unknown formats
else:
contents.extend(extractBinaryContent(fileName, fileContent, mimeType))
# Fallback when no content could be extracted
if not contents:
logger.warning(f"No content extracted from file '{fileName}', using binary fallback")
contents.append({
"sequenceNr": 1,
"name": '1_undefined',
"ext": os.path.splitext(fileName)[1][1:] if os.path.splitext(fileName)[1] else "bin",
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False
}
})
# Add generic attributes for all documents
for content in contents:
if isinstance(content.get("data"), bytes):
content["data"] = base64.b64encode(content["data"]).decode('utf-8')
# Add base64 flag
if "metadata" not in content:
content["metadata"] = {}
content["metadata"]["base64Encoded"] = True
logger.info(f"Successfully extracted {len(contents)} content items from file '{fileName}'")
return contents
except Exception as e:
logger.error(f"Error during content extraction: {str(e)}")
# Fallback on error - return original data
return [{
"sequenceNr": 1,
"name": fileMetadata.get("name", "unknown"),
"ext": os.path.splitext(fileMetadata.get("name", ""))[1][1:] if os.path.splitext(fileMetadata.get("name", ""))[1] else "bin",
"contentType": fileMetadata.get("mimeType", "application/octet-stream"),
"data": fileContent,
"metadata": {
"isText": False
}
}]
def _loadPdfExtractor():
"""Loads PDF extraction libraries when needed"""
global pdfExtractorLoaded
if not pdfExtractorLoaded:
try:
global PyPDF2, fitz
import PyPDF2
import fitz # PyMuPDF for more extensive PDF processing
pdfExtractorLoaded = True
logger.info("PDF extraction libraries successfully loaded")
except ImportError as e:
logger.warning(f"PDF extraction libraries could not be loaded: {e}")
def _loadOfficeExtractor():
"""Loads Office document extraction libraries when needed"""
global officeExtractorLoaded
if not officeExtractorLoaded:
try:
global docx, openpyxl
import docx # python-docx for Word documents
import openpyxl # for Excel files
officeExtractorLoaded = True
logger.info("Office extraction libraries successfully loaded")
except ImportError as e:
logger.warning(f"Office extraction libraries could not be loaded: {e}")
def _loadImageProcessor():
"""Loads image processing libraries when needed"""
global imageProcessorLoaded
if not imageProcessorLoaded:
try:
global PIL, Image
from PIL import Image
imageProcessorLoaded = True
logger.info("Image processing libraries successfully loaded")
except ImportError as e:
logger.warning(f"Image processing libraries could not be loaded: {e}")
def extractTextContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
"""
Extracts text from text files.
Args:
fileName: Name of the file
fileContent: Binary data of the file
mimeType: MIME type of the file
Returns:
List of Text-Content objects with metadata.isText = True
"""
try:
# Keep original file extension
fileExtension = os.path.splitext(fileName)[1][1:] if os.path.splitext(fileName)[1] else "txt"
# Extract text content
textContent = fileContent.decode('utf-8')
return [{
"sequenceNr": 1,
"name": "1_text", # Simplified naming
"ext": fileExtension,
"contentType": "text",
"data": textContent,
"metadata": {
"isText": True
}
}]
except UnicodeDecodeError:
logger.warning(f"Could not decode text from file '{fileName}' as UTF-8, trying alternative encodings")
try:
# Try alternative encodings
for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
try:
textContent = fileContent.decode(encoding)
logger.info(f"Text successfully decoded with encoding {encoding}")
return [{
"sequenceNr": 1,
"name": "1_text", # Simplified naming
"ext": fileExtension,
"contentType": "text",
"data": textContent,
"metadata": {
"isText": True,
"encoding": encoding
}
}]
except UnicodeDecodeError:
continue
# Fallback to binary data if no encoding works
logger.warning(f"Could not decode text, using binary data")
return [{
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False
}
}]
except Exception as e:
logger.error(f"Error in alternative text decoding: {str(e)}")
# Return binary data as fallback
return [{
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False
}
}]
def extractCsvContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
"""
Extracts content from CSV files.
Args:
fileName: Name of the file
fileContent: Binary data of the file
Returns:
List of CSV-Content objects with metadata.isText = True
"""
try:
# Extract text content
csvContent = fileContent.decode('utf-8')
return [{
"sequenceNr": 1,
"name": "1_csv", # Simplified naming
"ext": "csv",
"contentType": "csv",
"data": csvContent,
"metadata": {
"isText": True,
"format": "csv"
}
}]
except UnicodeDecodeError:
logger.warning(f"Could not decode CSV from file '{fileName}' as UTF-8, trying alternative encodings")
try:
# Try alternative encodings for CSV
for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
try:
csvContent = fileContent.decode(encoding)
logger.info(f"CSV successfully decoded with encoding {encoding}")
return [{
"sequenceNr": 1,
"name": "1_csv", # Simplified naming
"ext": "csv",
"contentType": "csv",
"data": csvContent,
"metadata": {
"isText": True,
"encoding": encoding,
"format": "csv"
}
}]
except UnicodeDecodeError:
continue
# Fallback to binary data
return [{
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "csv",
"contentType": "text/csv",
"data": fileContent,
"metadata": {
"isText": False
}
}]
except Exception as e:
logger.error(f"Error in alternative CSV decoding: {str(e)}")
return [{
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "csv",
"contentType": "text/csv",
"data": fileContent,
"metadata": {
"isText": False
}
}]
def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
"""
Extracts content from SVG files.
Args:
fileName: Name of the file
fileContent: Binary data of the file
Returns:
List of SVG-Content objects with dual text/image metadata
"""
contents = []
try:
# Extract SVG as text content (XML)
svgText = fileContent.decode('utf-8')
# Check if it's actually SVG by looking for the SVG tag
if "<svg" in svgText.lower():
# SVG is both text (XML) and an image
contents.append({
"sequenceNr": 1,
"name": "1_svg", # Simplified naming
"ext": "svg",
"contentType": "image/svg+xml",
"data": svgText,
"metadata": {
"isText": True, # SVG is text-based (XML)
"format": "svg",
"isImage": True # But also represents an image
}
})
else:
# Doesn't appear to be a valid SVG file
logger.warning(f"File '{fileName}' has SVG extension but does not contain SVG markup")
contents.append({
"sequenceNr": 1,
"name": "1_text",
"ext": "svg",
"contentType": "text/plain",
"data": svgText,
"metadata": {
"isText": True,
"format": "text"
}
})
except UnicodeDecodeError:
logger.warning(f"Could not decode SVG from file '{fileName}' as UTF-8, trying alternative encodings")
try:
# Try alternative encodings
for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
try:
svgText = fileContent.decode(encoding)
if "<svg" in svgText.lower():
logger.info(f"SVG successfully decoded with encoding {encoding}")
contents.append({
"sequenceNr": 1,
"name": "1_svg", # Simplified naming
"ext": "svg",
"contentType": "image/svg+xml",
"data": svgText,
"metadata": {
"isText": True,
"format": "svg",
"isImage": True,
"encoding": encoding
}
})
break
except UnicodeDecodeError:
continue
# Fallback to binary data if no encoding works
if not contents:
logger.warning(f"Could not decode SVG text, using binary data")
contents.append({
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "svg",
"contentType": "image/svg+xml",
"data": fileContent,
"metadata": {
"isText": False,
"format": "svg",
"isImage": True
}
})
except Exception as e:
logger.error(f"Error in alternative SVG decoding: {str(e)}")
# Return binary data as fallback
contents.append({
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "svg",
"contentType": "image/svg+xml",
"data": fileContent,
"metadata": {
"isText": False,
"format": "svg",
"isImage": True
}
})
return contents
def extractImageContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
"""
Extracts content from image files and optionally generates metadata descriptions.
Args:
fileName: Name of the file
fileContent: Binary data of the file
mimeType: MIME type of the file
Returns:
List of Image-Content objects with metadata.isText = False
"""
# Extract file extension from MIME type or filename
fileExtension = mimeType.split('/')[-1]
if fileExtension == "jpeg":
fileExtension = "jpg"
# If possible, analyze image and extract metadata
imageMetadata = {
"isText": False,
"format": "image"
}
imageDescription = None
try:
_loadImageProcessor()
if imageProcessorLoaded and fileContent and len(fileContent) > 0:
with io.BytesIO(fileContent) as imgStream:
try:
img = Image.open(imgStream)
# Check if the image was actually loaded
img.verify()
# To safely continue working, reload
imgStream.seek(0)
img = Image.open(imgStream)
imageMetadata.update({
"format": img.format,
"mode": img.mode,
"width": img.width,
"height": img.height
})
# Extract EXIF data if available
if hasattr(img, '_getexif') and callable(img._getexif):
exif = img._getexif()
if exif:
exifData = {}
for tagId, value in exif.items():
exifData[f"tag_{tagId}"] = str(value)
imageMetadata["exif"] = exifData
# Generate image description
imageDescription = f"Image ({img.width}x{img.height}, {img.format}, {img.mode})"
except Exception as innerE:
logger.warning(f"Error processing image: {str(innerE)}")
imageMetadata["error"] = str(innerE)
imageDescription = f"Image (unable to process: {str(innerE)})"
except Exception as e:
logger.warning(f"Could not extract image metadata: {str(e)}")
imageMetadata["error"] = str(e)
# Return image content
contents = [{
"sequenceNr": 1,
"name": "1_image", # Simplified naming
"ext": fileExtension,
"contentType": "image",
"data": fileContent,
"metadata": imageMetadata
}]
# If image description available, add as additional text content
if imageDescription:
contents.append({
"sequenceNr": 2,
"name": "2_text_image_info", # Simplified naming with label
"ext": "txt",
"contentType": "text",
"data": imageDescription,
"metadata": {
"isText": True,
"imageDescription": True
}
})
return contents
def extractPdfContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
"""
Extracts text and images from PDF files.
Args:
fileName: Name of the file
fileContent: Binary data of the file
Returns:
List of PDF-Content objects (text and images) with metadata.isText flag
"""
contents = []
extractedContentFound = False
try:
# Load PDF extraction libraries
_loadPdfExtractor()
if not pdfExtractorLoaded:
logger.warning("PDF extraction not possible: Libraries not available")
# Add original file as binary content
contents.append({
"sequenceNr": 1,
"name": "1_pdf", # Simplified naming
"ext": "pdf",
"contentType": "application/pdf",
"data": fileContent,
"metadata": {
"isText": False,
"format": "pdf"
}
})
return contents
# Extract text with PyPDF2
extractedText = ""
pdfMetadata = {}
with io.BytesIO(fileContent) as pdfStream:
pdfReader = PyPDF2.PdfReader(pdfStream)
# Extract metadata
pdfInfo = pdfReader.metadata or {}
for key, value in pdfInfo.items():
if key.startswith('/'):
pdfMetadata[key[1:]] = value
else:
pdfMetadata[key] = value
# Extract text from all pages
for pageNum in range(len(pdfReader.pages)):
page = pdfReader.pages[pageNum]
pageText = page.extract_text()
if pageText:
extractedText += f"--- Page {pageNum + 1} ---\n{pageText}\n\n"
# If text was found, add as separate content
if extractedText.strip():
extractedContentFound = True
contents.append({
"sequenceNr": len(contents) + 1,
"name": f"{len(contents) + 1}_text", # Simplified naming
"ext": "txt",
"contentType": "text",
"data": extractedText,
"metadata": {
"isText": True,
"source": "pdf",
"pages": len(pdfReader.pages),
"pdfMetadata": pdfMetadata
}
})
# Extract images with PyMuPDF (fitz)
try:
with io.BytesIO(fileContent) as pdfStream:
doc = fitz.open(stream=pdfStream, filetype="pdf")
imageCount = 0
for pageNum in range(len(doc)):
page = doc[pageNum]
imageList = page.get_images(full=True)
for imgIndex, imgInfo in enumerate(imageList):
try:
imageCount += 1
xref = imgInfo[0]
baseImage = doc.extract_image(xref)
imageBytes = baseImage["image"]
imageExt = baseImage["ext"]
# Add image as content
extractedContentFound = True
contents.append({
"sequenceNr": len(contents) + 1,
"name": f"{len(contents) + 1}_image_page{pageNum+1}_{imgIndex+1}", # Simplified naming with label
"ext": imageExt,
"contentType": f"image/{imageExt}",
"data": imageBytes,
"metadata": {
"isText": False,
"source": "pdf",
"page": pageNum + 1,
"index": imgIndex
}
})
except Exception as imgE:
logger.warning(f"Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}")
# Close document
doc.close()
except Exception as imgExtractE:
logger.warning(f"Error extracting images from PDF: {str(imgExtractE)}")
except Exception as e:
logger.error(f"Error in PDF extraction: {str(e)}")
# If no content was extracted, add the original PDF
if not extractedContentFound:
contents.append({
"sequenceNr": 1,
"name": "1_pdf", # Simplified naming
"ext": "pdf",
"contentType": "application/pdf",
"data": fileContent,
"metadata": {
"isText": False,
"format": "pdf"
}
})
return contents
def extractWordContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
"""
Extracts text and images from Word documents.
Args:
fileName: Name of the file
fileContent: Binary data of the file
mimeType: MIME type of the file
Returns:
List of Word-Content objects (text and possibly images) with metadata.isText flag
"""
contents = []
extractedContentFound = False
# Determine file extension
fileExtension = "docx" if mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" else "doc"
try:
# Load Office extraction libraries
_loadOfficeExtractor()
if not officeExtractorLoaded:
logger.warning("Word extraction not possible: Libraries not available")
# Add original file as binary content
contents.append({
"sequenceNr": 1,
"name": "1_word", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False,
"format": "word"
}
})
return contents
# Only supports DOCX (newer format)
if mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
with io.BytesIO(fileContent) as docxStream:
doc = docx.Document(docxStream)
# Extract text
fullText = []
for para in doc.paragraphs:
fullText.append(para.text)
# Extract tables
for table in doc.tables:
for row in table.rows:
rowText = []
for cell in row.cells:
rowText.append(cell.text)
fullText.append(" | ".join(rowText))
extractedText = "\n\n".join(fullText)
# Add extracted text as content
if extractedText.strip():
extractedContentFound = True
contents.append({
"sequenceNr": 1,
"name": "1_text", # Simplified naming
"ext": "txt",
"contentType": "text",
"data": extractedText,
"metadata": {
"isText": True,
"source": "docx",
"paragraphCount": len(doc.paragraphs),
"tableCount": len(doc.tables)
}
})
else:
logger.warning(f"Extraction from old Word format (DOC) not supported")
except Exception as e:
logger.error(f"Error in Word extraction: {str(e)}")
# If no content was extracted, add the original document
if not extractedContentFound:
contents.append({
"sequenceNr": 1,
"name": "1_word", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False,
"format": "word"
}
})
return contents
def extractExcelContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
"""
Extracts table data from Excel files.
Args:
fileName: Name of the file
fileContent: Binary data of the file
mimeType: MIME type of the file
Returns:
List of Excel-Content objects with metadata.isText flag
"""
contents = []
extractedContentFound = False
# Determine file extension
fileExtension = "xlsx" if mimeType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" else "xls"
try:
# Load Office extraction libraries
_loadOfficeExtractor()
if not officeExtractorLoaded:
logger.warning("Excel extraction not possible: Libraries not available")
# Add original file as binary content
contents.append({
"sequenceNr": 1,
"name": "1_excel", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False,
"format": "excel"
}
})
return contents
# Only supports XLSX (newer format)
if mimeType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
with io.BytesIO(fileContent) as xlsxStream:
workbook = openpyxl.load_workbook(xlsxStream, data_only=True)
# Extract each worksheet as separate CSV content
for sheetIndex, sheetName in enumerate(workbook.sheetnames):
sheet = workbook[sheetName]
# Format data as CSV
csvRows = []
for row in sheet.iter_rows():
csvRow = []
for cell in row:
value = cell.value
if value is None:
csvRow.append("")
else:
csvRow.append(str(value).replace('"', '""'))
csvRows.append(','.join(f'"{cell}"' for cell in csvRow))
csvContent = "\n".join(csvRows)
# Add as CSV content
if csvContent.strip():
extractedContentFound = True
sheetSafeName = sheetName.replace(" ", "_").replace("/", "_").replace("\\", "_")
contents.append({
"sequenceNr": len(contents) + 1,
"name": f"{len(contents) + 1}_csv_{sheetSafeName}", # Simplified naming with sheet label
"ext": "csv",
"contentType": "csv",
"data": csvContent,
"metadata": {
"isText": True,
"source": "xlsx",
"sheet": sheetName,
"format": "csv"
}
})
else:
logger.warning(f"Extraction from old Excel format (XLS) not supported")
except Exception as e:
logger.error(f"Error in Excel extraction: {str(e)}")
# If no content was extracted, add the original document
if not extractedContentFound:
contents.append({
"sequenceNr": 1,
"name": "1_excel", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False,
"format": "excel"
}
})
return contents
def extractPowerpointContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
"""
Extracts content from PowerPoint presentations.
Args:
fileName: Name of the file
fileContent: Binary data of the file
mimeType: MIME type of the file
Returns:
List of PowerPoint-Content objects with metadata.isText = False
"""
# For PowerPoint, we currently only return the original binary file
# A complete extraction would require more specialized libraries
fileExtension = "pptx" if mimeType == "application/vnd.openxmlformats-officedocument.presentationml.presentation" else "ppt"
return [{
"sequenceNr": 1,
"name": "1_powerpoint", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False,
"format": "powerpoint"
}
}]
def extractBinaryContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
"""
Fallback for binary files where no specific extraction is possible.
Args:
fileName: Name of the file
fileContent: Binary data of the file
mimeType: MIME type of the file
Returns:
List with a binary Content object with metadata.isText = False
"""
fileExtension = os.path.splitext(fileName)[1][1:] if os.path.splitext(fileName)[1] else "bin"
return [{
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": fileExtension,
"contentType": mimeType,
"data": fileContent,
"metadata": {
"isText": False,
"format": "binary"
}
}]