"""
Module for extracting content from various file formats.

Provides specialized functions for processing text, PDF, Office documents,
images, etc.
"""
|
|
|
|
import logging
|
|
import os
|
|
import io
|
|
from typing import Dict, Any, List, Optional, Union, Tuple
|
|
import base64
|
|
|
|
# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Lazy-import state. Each flag starts out False and is flipped to True by the
# matching _load* function once its optional third-party imports succeeded.
pdfExtractorLoaded: bool = False
officeExtractorLoaded: bool = False
imageProcessorLoaded: bool = False
|
|
|
|
class FileProcessingError(Exception):
    """Error type raised by this module when processing a file fails."""
|
|
|
|
# Generic MIME type reported when the real type of a file is unknown.
_GENERIC_MIME_TYPE = "application/octet-stream"

# Extension -> MIME type lookup consulted when only the generic type is known.
_EXTENSION_TO_MIME = {
    '.txt': 'text/plain',
    '.md': 'text/markdown',
    '.csv': 'text/csv',
    '.json': 'application/json',
    '.xml': 'application/xml',
    '.js': 'application/javascript',
    '.py': 'application/x-python',
    '.svg': 'image/svg+xml',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.pdf': 'application/pdf',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.doc': 'application/msword',
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.xls': 'application/vnd.ms-excel',
    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    '.ppt': 'application/vnd.ms-powerpoint',
}

# MIME types outside "text/*" that are still routed to the plain-text extractor.
_TEXT_LIKE_MIME_TYPES = (
    "application/json",
    "application/xml",
    "application/javascript",
    "application/x-python",
)

_WORD_MIME_TYPES = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/msword",
)

_EXCEL_MIME_TYPES = (
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
)

_POWERPOINT_MIME_TYPES = (
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.ms-powerpoint",
)


def _guessMimeTypeFromExtension(fileName: str) -> Optional[str]:
    """Return the MIME type mapped to the extension of fileName, or None if there is none."""
    ext = os.path.splitext(fileName)[1].lower()
    if not ext:
        return None

    guessed = _EXTENSION_TO_MIME.get(ext)
    if guessed is None:
        logger.warning(f"Unknown file extension {ext} for file {fileName}")
    else:
        logger.info(f"Detected MIME type {guessed} from extension {ext}")
    return guessed


def _extractByMimeType(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
    """Route the file to the extractor responsible for mimeType and return its content items."""
    if mimeType == _GENERIC_MIME_TYPE:
        # Still unknown after the extension lookup: sniff whether the bytes are UTF-8 text.
        try:
            fileContent.decode('utf-8')
        except UnicodeDecodeError:
            logger.info(f"File {fileName} is not text, treating as binary")
            return extractBinaryContent(fileName, fileContent, mimeType)
        logger.info(f"Successfully decoded file {fileName} as text")
        return extractTextContent(fileName, fileContent, "text/plain")

    # CSV is checked before the generic "text/*" rule because it has its own handler.
    if mimeType == "text/csv":
        return extractCsvContent(fileName, fileContent)
    if mimeType.startswith("text/") or mimeType in _TEXT_LIKE_MIME_TYPES:
        return extractTextContent(fileName, fileContent, mimeType)

    # SVG is checked before the generic "image/*" rule for the same reason.
    if mimeType == "image/svg+xml":
        return extractSvgContent(fileName, fileContent)
    if mimeType.startswith("image/"):
        return extractImageContent(fileName, fileContent, mimeType)

    if mimeType == "application/pdf":
        return extractPdfContent(fileName, fileContent)
    if mimeType in _WORD_MIME_TYPES:
        return extractWordContent(fileName, fileContent, mimeType)
    if mimeType in _EXCEL_MIME_TYPES:
        return extractExcelContent(fileName, fileContent, mimeType)
    if mimeType in _POWERPOINT_MIME_TYPES:
        return extractPowerpointContent(fileName, fileContent, mimeType)

    logger.warning(f"Unknown MIME type {mimeType} for file {fileName}, treating as binary")
    return extractBinaryContent(fileName, fileContent, mimeType)


def _normalizeContentItems(contents: List[Dict[str, Any]]) -> None:
    """Ensure every item carries base64Encoded, both top-level and mirrored in its metadata."""
    for content in contents:
        if "base64Encoded" not in content:
            if isinstance(content.get("data"), bytes):
                # Raw bytes cannot be handed on as-is; ship them base64 encoded.
                content["data"] = base64.b64encode(content["data"]).decode('utf-8')
                content["base64Encoded"] = True
            else:
                # Anything not explicitly marked is assumed to be text.
                content["base64Encoded"] = False

        # The flag is mirrored into metadata for backward compatibility.
        content.setdefault("metadata", {})
        content["metadata"]["base64Encoded"] = content["base64Encoded"]


def getDocumentContents(fileMetadata: Dict[str, Any], fileContent: bytes) -> List[Dict[str, Any]]:
    """
    Main function for extracting content from a file based on its MIME type.
    Delegates to the specialized extraction functions of this module.

    When the declared MIME type is the generic "application/octet-stream", the
    file extension is consulted first. A MIME type found that way is used for
    dispatching, so e.g. a ".pdf" file is handed to the PDF extractor. Only if
    the type is still unknown afterwards is the content sniffed as UTF-8 text,
    with binary as the last resort.

    Args:
        fileMetadata: File metadata; the keys "name" and "mimeType" are read.
            None or an empty dict is treated as "no metadata".
        fileContent: Binary data of the file

    Returns:
        List of Document-Content dicts. Every item has a "base64Encoded" flag,
        mirrored into its "metadata" dict. If extraction raises, a single item
        holding the base64 encoded original data is returned instead.
    """
    metadata = fileMetadata or {}
    declaredMimeType = metadata.get("mimeType", _GENERIC_MIME_TYPE)
    fileName = metadata.get("name", "unknown")

    try:
        logger.info(f"Extracting content from file '{fileName}' (MIME type: {declaredMimeType})")

        mimeType = declaredMimeType
        if mimeType == _GENERIC_MIME_TYPE:
            mimeType = _guessMimeTypeFromExtension(fileName) or mimeType

        # Copy so the fallback append below never mutates a list owned by an extractor.
        contents = list(_extractByMimeType(fileName, fileContent, mimeType))

        if not contents:
            logger.warning(f"No content extracted from file '{fileName}', using binary fallback")
            suffix = os.path.splitext(fileName)[1]
            contents.append({
                "sequenceNr": 1,
                "name": '1_undefined',
                "ext": suffix[1:] if suffix else "bin",
                "mimeType": mimeType,
                "data": base64.b64encode(fileContent).decode('utf-8'),
                "base64Encoded": True,
                "metadata": {
                    "isText": False
                }
            })

        _normalizeContentItems(contents)

        logger.info(f"Successfully extracted {len(contents)} content items from file '{fileName}'")
        return contents

    except Exception as e:
        logger.error(f"Error during content extraction for file {fileName}: {str(e)}", exc_info=True)
        # Fallback on error: return the original data with the MIME type as declared.
        suffix = os.path.splitext(fileName)[1]
        return [{
            "sequenceNr": 1,
            "name": fileName,
            "ext": suffix[1:] if suffix else "bin",
            "mimeType": declaredMimeType,
            "data": base64.b64encode(fileContent).decode('utf-8'),
            "base64Encoded": True,
            "metadata": {
                "isText": False,
                "base64Encoded": True  # For backward compatibility
            }
        }]
|
|
|
|
|
|
def _loadPdfExtractor():
    """
    Import the PDF libraries (PyPDF2 and fitz) on first use.

    On success both modules are bound as module globals and
    pdfExtractorLoaded is set to True. If either import fails, a warning is
    logged and the flag stays False, so a later call tries again.
    """
    global pdfExtractorLoaded, PyPDF2, fitz

    if pdfExtractorLoaded:
        return

    try:
        import PyPDF2
        import fitz  # PyMuPDF for more extensive PDF processing
    except ImportError as e:
        logger.warning(f"PDF extraction libraries could not be loaded: {e}")
    else:
        pdfExtractorLoaded = True
        logger.info("PDF extraction libraries successfully loaded")
|
|
|
|
def _loadOfficeExtractor():
    """
    Import the Office libraries (docx and openpyxl) on first use.

    On success both modules are bound as module globals and
    officeExtractorLoaded is set to True. If either import fails, a warning is
    logged and the flag stays False, so a later call tries again.
    """
    global officeExtractorLoaded, docx, openpyxl

    if officeExtractorLoaded:
        return

    try:
        import docx  # python-docx for Word documents
        import openpyxl  # for Excel files
    except ImportError as e:
        logger.warning(f"Office extraction libraries could not be loaded: {e}")
    else:
        officeExtractorLoaded = True
        logger.info("Office extraction libraries successfully loaded")
|
|
|
|
def _loadImageProcessor():
    """
    Import the image library (PIL.Image) on first use.

    On success Image is bound as a module global and imageProcessorLoaded is
    set to True. If the import fails, a warning is logged and the flag stays
    False, so a later call tries again.
    """
    # PIL is declared global as in the original code, although only Image is bound here.
    global imageProcessorLoaded, PIL, Image

    if imageProcessorLoaded:
        return

    try:
        from PIL import Image
    except ImportError as e:
        logger.warning(f"Image processing libraries could not be loaded: {e}")
    else:
        imageProcessorLoaded = True
        logger.info("Image processing libraries successfully loaded")
|
|
|
|
def extractTextContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
    """
    Extracts text from text files.

    The bytes are decoded as UTF-8 first. If that fails, Windows-1252 is tried
    and finally Latin-1. Latin-1 is deliberately last: it maps every byte
    value and therefore never fails, so any encoding listed after it would
    never be reached.

    Args:
        fileName: Name of the file; its extension becomes "ext" ("txt" if none)
        fileContent: Binary data of the file
        mimeType: MIME type of the file; only used for the binary fallback item

    Returns:
        List with one Text-Content dict (base64Encoded = False). When a
        non-UTF-8 encoding was used, its name is stored in metadata["encoding"].
    """
    suffix = os.path.splitext(fileName)[1]
    # Keep the original file extension, without the leading dot.
    fileExtension = suffix[1:] if suffix else "txt"

    def textItem(text: str, encoding: Optional[str] = None) -> Dict[str, Any]:
        itemMetadata: Dict[str, Any] = {"isText": True}
        if encoding is not None:
            itemMetadata["encoding"] = encoding
        return {
            "sequenceNr": 1,
            "name": "1_text",  # Simplified naming
            "ext": fileExtension,
            "mimeType": "text/plain",
            "data": text,
            "base64Encoded": False,
            "metadata": itemMetadata
        }

    try:
        return [textItem(fileContent.decode('utf-8'))]
    except UnicodeDecodeError:
        logger.warning(f"Could not decode text from file '{fileName}' as UTF-8, trying alternative encodings")

    for encoding in ('cp1252', 'latin-1'):
        try:
            textContent = fileContent.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(f"Text successfully decoded with encoding {encoding}")
        return [textItem(textContent, encoding)]

    # Safety net only: Latin-1 accepts any byte sequence, so this is not expected to run.
    logger.warning("Could not decode text, using binary data")
    return [{
        "sequenceNr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": fileExtension,
        "mimeType": mimeType,
        "data": base64.b64encode(fileContent).decode('utf-8'),
        "base64Encoded": True,
        "metadata": {
            "isText": False
        }
    }]
|
|
|
|
def extractCsvContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
    """
    Extracts content from CSV files.

    The bytes are decoded as UTF-8 first. If that fails, Windows-1252 is tried
    and finally Latin-1. Latin-1 is deliberately last: it maps every byte
    value and therefore never fails, so any encoding listed after it would
    never be reached. The CSV text itself is returned unparsed.

    Args:
        fileName: Name of the file (only used in log messages)
        fileContent: Binary data of the file

    Returns:
        List with one CSV-Content dict (base64Encoded = False). When a
        non-UTF-8 encoding was used, its name is stored in metadata["encoding"].
    """
    def csvItem(text: str, encoding: Optional[str] = None) -> Dict[str, Any]:
        itemMetadata: Dict[str, Any] = {"isText": True}
        if encoding is not None:
            itemMetadata["encoding"] = encoding
        itemMetadata["format"] = "csv"
        return {
            "sequenceNr": 1,
            "name": "1_csv",  # Simplified naming
            "ext": "csv",
            "mimeType": "text/csv",
            "data": text,
            "base64Encoded": False,
            "metadata": itemMetadata
        }

    try:
        return [csvItem(fileContent.decode('utf-8'))]
    except UnicodeDecodeError:
        logger.warning(f"Could not decode CSV from file '{fileName}' as UTF-8, trying alternative encodings")

    for encoding in ('cp1252', 'latin-1'):
        try:
            csvContent = fileContent.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(f"CSV successfully decoded with encoding {encoding}")
        return [csvItem(csvContent, encoding)]

    # Safety net only: Latin-1 accepts any byte sequence, so this is not expected to run.
    return [{
        "sequenceNr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": "csv",
        "mimeType": "text/csv",
        "data": base64.b64encode(fileContent).decode('utf-8'),
        "base64Encoded": True,
        "metadata": {
            "isText": False
        }
    }]
|
|
|
|
def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
    """
    Extracts content from SVG files.

    UTF-8 decodable content is returned as an SVG item when it contains an
    "<svg" tag (case-insensitive), otherwise as a plain-text item. Content
    that is not valid UTF-8 is retried with a few single-byte encodings; if
    none of them yields text containing "<svg", the raw bytes are returned
    base64 encoded.

    Args:
        fileName: Name of the file (only used in log messages)
        fileContent: Binary data of the file

    Returns:
        List with one SVG-Content dict carrying text/image metadata
    """
    def entry(name: str, mime: str, payload: str, isEncoded: bool, meta: Dict[str, Any]) -> Dict[str, Any]:
        # Every item produced here shares sequence number and extension.
        return {
            "sequenceNr": 1,
            "name": name,
            "ext": "svg",
            "mimeType": mime,
            "data": payload,
            "base64Encoded": isEncoded,
            "metadata": meta
        }

    def rawEntry() -> Dict[str, Any]:
        return entry(
            "1_binary",
            "image/svg+xml",
            base64.b64encode(fileContent).decode('utf-8'),
            True,
            {"isText": False, "format": "svg", "isImage": True},
        )

    try:
        markup = fileContent.decode('utf-8')
    except UnicodeDecodeError:
        logger.warning(f"Could not decode SVG from file '{fileName}' as UTF-8, trying alternative encodings")
    else:
        if "<svg" in markup.lower():
            # SVG is text (XML) and an image at the same time.
            return [entry("1_svg", "image/svg+xml", markup, False,
                          {"isText": True, "format": "svg", "isImage": True})]
        logger.warning(f"File '{fileName}' has SVG extension but does not contain SVG markup")
        return [entry("1_text", "text/plain", markup, False,
                      {"isText": True, "format": "text"})]

    results = []
    try:
        for encoding in ('latin-1', 'cp1252', 'iso-8859-1'):
            try:
                markup = fileContent.decode(encoding)
            except UnicodeDecodeError:
                continue
            if "<svg" in markup.lower():
                logger.info(f"SVG successfully decoded with encoding {encoding}")
                results.append(entry("1_svg", "image/svg+xml", markup, False,
                                     {"isText": True, "format": "svg", "isImage": True, "encoding": encoding}))
                break

        if not results:
            # No candidate encoding produced SVG markup.
            logger.warning(f"Could not decode SVG text, using binary data")
            results.append(rawEntry())
    except Exception as e:
        logger.error(f"Error in alternative SVG decoding: {str(e)}")
        results.append(rawEntry())

    return results
|
|
|
|
def extractImageContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
    """
    Extracts content from image files and optionally generates metadata descriptions.

    The image bytes are always returned base64 encoded. If the image library
    could be loaded and the data is non-empty, format, mode, dimensions and
    (where the image object offers it) EXIF tags are added to the metadata,
    and a second, textual item describing the image is appended.

    Args:
        fileName: Name of the file (not used by the extraction itself)
        fileContent: Binary data of the file
        mimeType: MIME type of the file; its subtype becomes the extension

    Returns:
        List of Image-Content dicts: the image item (base64Encoded = True),
        optionally followed by a text item holding the description
    """
    # The extension is derived from the MIME subtype, with "jpeg" shortened to "jpg".
    subtype = mimeType.split('/')[-1]
    fileExtension = "jpg" if subtype == "jpeg" else subtype

    imageMetadata = {"isText": False, "format": "image"}
    imageDescription = None

    try:
        _loadImageProcessor()
        if imageProcessorLoaded and fileContent and len(fileContent) > 0:
            with io.BytesIO(fileContent) as imgStream:
                try:
                    # verify() checks the file; the image is reopened afterwards for further use.
                    Image.open(imgStream).verify()
                    imgStream.seek(0)
                    img = Image.open(imgStream)

                    imageMetadata.update(
                        format=img.format,
                        mode=img.mode,
                        width=img.width,
                        height=img.height,
                    )

                    # EXIF is read through the image object's _getexif accessor when it has one.
                    if callable(getattr(img, '_getexif', None)):
                        exif = img._getexif()
                        if exif:
                            imageMetadata["exif"] = {
                                f"tag_{tagId}": str(value) for tagId, value in exif.items()
                            }

                    imageDescription = f"Image ({img.width}x{img.height}, {img.format}, {img.mode})"
                except Exception as innerE:
                    logger.warning(f"Error processing image: {str(innerE)}")
                    imageMetadata["error"] = str(innerE)
                    imageDescription = f"Image (unable to process: {str(innerE)})"
    except Exception as e:
        logger.warning(f"Could not extract image metadata: {str(e)}")
        imageMetadata["error"] = str(e)

    contents = [{
        "sequenceNr": 1,
        "name": "1_image",  # Simplified naming
        "ext": fileExtension,
        "mimeType": mimeType,
        "data": base64.b64encode(fileContent).decode('utf-8'),
        "base64Encoded": True,
        "metadata": imageMetadata
    }]

    if not imageDescription:
        return contents

    # The description travels as an additional plain-text item.
    contents.append({
        "sequenceNr": 2,
        "name": "2_text_image_info",  # Simplified naming with label
        "ext": "txt",
        "mimeType": "text/plain",
        "data": imageDescription,
        "base64Encoded": False,
        "metadata": {
            "isText": True,
            "imageDescription": True
        }
    })
    return contents
|
|
|
|
def extractPdfContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
    """
    Extracts text and images from PDF files.

    Text of all pages is read with PyPDF2 and returned as one text item, each
    page introduced by a "--- Page N ---" marker. Embedded images are read
    with PyMuPDF (fitz) and returned as one base64 encoded item each. Failures
    are logged rather than raised. If nothing could be extracted, or the PDF
    libraries are not available, the original PDF is returned base64 encoded.

    Args:
        fileName: Name of the file (not used by the extraction itself)
        fileContent: Binary data of the file

    Returns:
        List of PDF-Content dicts (text and images) with appropriate base64Encoded flags
    """
    def originalPdfItem() -> Dict[str, Any]:
        return {
            "sequenceNr": 1,
            "name": "1_pdf",  # Simplified naming
            "ext": "pdf",
            "mimeType": "application/pdf",
            "data": base64.b64encode(fileContent).decode('utf-8'),
            "base64Encoded": True,
            "metadata": {
                "isText": False,
                "format": "pdf"
            }
        }

    contents = []

    try:
        _loadPdfExtractor()
        if not pdfExtractorLoaded:
            logger.warning("PDF extraction not possible: Libraries not available")
            return [originalPdfItem()]

        # --- Text and document metadata via PyPDF2 ---
        pdfMetadata = {}
        pageTexts = []
        with io.BytesIO(fileContent) as pdfStream:
            pdfReader = PyPDF2.PdfReader(pdfStream)

            # Metadata keys are stored without a leading "/".
            for key, value in (pdfReader.metadata or {}).items():
                pdfMetadata[key[1:] if key.startswith('/') else key] = value

            # Read the page count while the stream is still open.
            pageCount = len(pdfReader.pages)
            for pageNumber, page in enumerate(pdfReader.pages, start=1):
                pageText = page.extract_text()
                if pageText:
                    pageTexts.append(f"--- Page {pageNumber} ---\n{pageText}\n\n")

        extractedText = "".join(pageTexts)
        if extractedText.strip():
            contents.append({
                "sequenceNr": len(contents) + 1,
                "name": f"{len(contents) + 1}_text",  # Simplified naming
                "ext": "txt",
                "mimeType": "text/plain",
                "data": extractedText,
                "base64Encoded": False,
                "metadata": {
                    "isText": True,
                    "source": "pdf",
                    "pages": pageCount,
                    "pdfMetadata": pdfMetadata
                }
            })

        # --- Embedded images via PyMuPDF (fitz) ---
        try:
            with io.BytesIO(fileContent) as pdfStream:
                doc = fitz.open(stream=pdfStream, filetype="pdf")
                try:
                    for pageNum in range(len(doc)):
                        page = doc[pageNum]
                        for imgIndex, imgInfo in enumerate(page.get_images(full=True)):
                            try:
                                # The first field of an image entry is its xref.
                                baseImage = doc.extract_image(imgInfo[0])
                                imageBytes = baseImage["image"]
                                imageExt = baseImage["ext"]

                                contents.append({
                                    "sequenceNr": len(contents) + 1,
                                    "name": f"{len(contents) + 1}_image_page{pageNum+1}_{imgIndex+1}",  # Simplified naming with label
                                    "ext": imageExt,
                                    "mimeType": f"image/{imageExt}",
                                    "data": base64.b64encode(imageBytes).decode('utf-8'),
                                    "base64Encoded": True,
                                    "metadata": {
                                        "isText": False,
                                        "source": "pdf",
                                        "page": pageNum + 1,
                                        "index": imgIndex
                                    }
                                })
                            except Exception as imgE:
                                # A broken image must not abort the remaining ones.
                                logger.warning(f"Error extracting image {imgIndex} on page {pageNum + 1}: {str(imgE)}")
                finally:
                    # Close the document even when iterating pages or images raised.
                    doc.close()
        except Exception as imgExtractE:
            logger.warning(f"Error extracting images from PDF: {str(imgExtractE)}")

    except Exception as e:
        logger.error(f"Error in PDF extraction: {str(e)}")

    # Nothing extracted at all: hand back the original PDF.
    if not contents:
        contents.append(originalPdfItem())

    return contents
|
|
|
|
def extractWordContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
    """
    Extracts text from Word documents.

    For DOCX files the text of all paragraphs, followed by all table rows
    (cells joined with " | "), is returned as one text item. The old DOC
    format is not parsed. Whenever no text could be extracted, including when
    the Office libraries are unavailable or parsing fails, the original
    document is returned base64 encoded.

    Args:
        fileName: Name of the file (not used by the extraction itself)
        fileContent: Binary data of the file
        mimeType: MIME type of the file; decides between "docx" and "doc"

    Returns:
        List with one Word-Content dict: extracted text or the original document
    """
    isDocx = mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    fileExtension = "docx" if isDocx else "doc"

    def originalDocumentItem() -> Dict[str, Any]:
        return {
            "sequenceNr": 1,
            "name": "1_word",  # Simplified naming
            "ext": fileExtension,
            "mimeType": mimeType,
            "data": base64.b64encode(fileContent).decode('utf-8'),
            "base64Encoded": True,
            "metadata": {
                "isText": False,
                "format": "word"
            }
        }

    contents = []
    textFound = False

    try:
        _loadOfficeExtractor()
        if not officeExtractorLoaded:
            logger.warning("Word extraction not possible: Libraries not available")
            return [originalDocumentItem()]

        if isDocx:
            with io.BytesIO(fileContent) as docxStream:
                doc = docx.Document(docxStream)

                # Paragraph text first, then one line per table row.
                fullText = [para.text for para in doc.paragraphs]
                for table in doc.tables:
                    for row in table.rows:
                        fullText.append(" | ".join([cell.text for cell in row.cells]))

                extractedText = "\n\n".join(fullText)

                if extractedText.strip():
                    textFound = True
                    contents.append({
                        "sequenceNr": 1,
                        "name": "1_text",  # Simplified naming
                        "ext": "txt",
                        "mimeType": "text/plain",
                        "data": extractedText,
                        "base64Encoded": False,
                        "metadata": {
                            "isText": True,
                            "source": "docx",
                            "paragraphCount": len(doc.paragraphs),
                            "tableCount": len(doc.tables)
                        }
                    })
        else:
            logger.warning(f"Extraction from old Word format (DOC) not supported")

    except Exception as e:
        logger.error(f"Error in Word extraction: {str(e)}")

    if not textFound:
        contents.append(originalDocumentItem())

    return contents
|
|
|
|
def extractExcelContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
    """
    Extracts table data from Excel files.

    For XLSX files every worksheet with content is returned as its own CSV
    item. All cells are double-quoted, embedded quotes are doubled, and empty
    cells become "". The old XLS format is not parsed. Whenever no sheet
    yielded content, including when the Office libraries are unavailable or
    parsing fails, the original workbook is returned base64 encoded.

    Args:
        fileName: Name of the file (not used by the extraction itself)
        fileContent: Binary data of the file
        mimeType: MIME type of the file; decides between "xlsx" and "xls"

    Returns:
        List of Excel-Content dicts with appropriate base64Encoded flags
    """
    isXlsx = mimeType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    fileExtension = "xlsx" if isXlsx else "xls"

    def originalWorkbookItem() -> Dict[str, Any]:
        return {
            "sequenceNr": 1,
            "name": "1_excel",  # Simplified naming
            "ext": fileExtension,
            "mimeType": mimeType,
            "data": base64.b64encode(fileContent).decode('utf-8'),
            "base64Encoded": True,
            "metadata": {
                "isText": False,
                "format": "excel"
            }
        }

    contents = []
    sheetFound = False

    try:
        _loadOfficeExtractor()
        if not officeExtractorLoaded:
            logger.warning("Excel extraction not possible: Libraries not available")
            return [originalWorkbookItem()]

        if isXlsx:
            with io.BytesIO(fileContent) as xlsxStream:
                workbook = openpyxl.load_workbook(xlsxStream, data_only=True)

                for sheetName in workbook.sheetnames:
                    sheet = workbook[sheetName]

                    csvRows = []
                    for row in sheet.iter_rows():
                        cellTexts = [
                            "" if cell.value is None else str(cell.value).replace('"', '""')
                            for cell in row
                        ]
                        csvRows.append(','.join(f'"{text}"' for text in cellTexts))

                    csvContent = "\n".join(csvRows)
                    if not csvContent.strip():
                        continue

                    sheetFound = True
                    # Blanks and path separators are replaced for the item name.
                    sheetSafeName = sheetName
                    for unsafeChar in (" ", "/", "\\"):
                        sheetSafeName = sheetSafeName.replace(unsafeChar, "_")

                    position = len(contents) + 1
                    contents.append({
                        "sequenceNr": position,
                        "name": f"{position}_csv_{sheetSafeName}",  # Simplified naming with sheet label
                        "ext": "csv",
                        "mimeType": "text/csv",
                        "data": csvContent,
                        "base64Encoded": False,
                        "metadata": {
                            "isText": True,
                            "source": "xlsx",
                            "sheet": sheetName,
                            "format": "csv"
                        }
                    })
        else:
            logger.warning(f"Extraction from old Excel format (XLS) not supported")

    except Exception as e:
        logger.error(f"Error in Excel extraction: {str(e)}")

    if not sheetFound:
        contents.append(originalWorkbookItem())

    return contents
|
|
|
|
def extractPowerpointContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
    """
    Returns a PowerPoint presentation as a single base64 encoded item.

    No slide content is extracted; the original file is passed through.

    Args:
        fileName: Name of the file (not used)
        fileContent: Binary data of the file
        mimeType: MIME type of the file; decides between "pptx" and "ppt"

    Returns:
        List with one PowerPoint-Content dict with base64Encoded = True
    """
    pptxMimeType = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    if mimeType == pptxMimeType:
        fileExtension = "pptx"
    else:
        fileExtension = "ppt"

    encodedPresentation = base64.b64encode(fileContent).decode('utf-8')
    presentationItem = {
        "sequenceNr": 1,
        "name": "1_powerpoint",  # Simplified naming
        "ext": fileExtension,
        "mimeType": mimeType,
        "data": encodedPresentation,
        "base64Encoded": True,
        "metadata": {"isText": False, "format": "powerpoint"},
    }
    return [presentationItem]
|
|
|
|
def extractBinaryContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
    """
    Fallback for binary files where no specific extraction is possible.

    Args:
        fileName: Name of the file; its extension becomes "ext" ("bin" if none)
        fileContent: Binary data of the file
        mimeType: MIME type of the file, passed through unchanged

    Returns:
        List with one binary Content dict with base64Encoded = True
    """
    suffix = os.path.splitext(fileName)[1]
    if suffix:
        fileExtension = suffix[1:]  # drop the leading dot
    else:
        fileExtension = "bin"

    binaryItem = {
        "sequenceNr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": fileExtension,
        "mimeType": mimeType,
        "data": base64.b64encode(fileContent).decode('utf-8'),
        "base64Encoded": True,
        "metadata": {"isText": False, "format": "binary"},
    }
    return [binaryItem]
|
|
|
|
def processFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
    """
    Process a file and return its contents as a list of documents.

    Images, PDFs, CSV and plain-text files are handed to the matching
    _process* method of self; every other MIME type is returned as one base64
    encoded document.

    NOTE(review): takes self and reads self.serviceBase although no enclosing
    class is visible in this file - presumably meant to be a method; confirm.

    Args:
        fileContent: Binary content of the file
        fileName: Name of the file
        fileMetadata: Optional metadata about the file; a "mimeType" entry
            takes precedence over the type reported by self.serviceBase

    Returns:
        List of document dictionaries

    Raises:
        FileProcessingError: If any step of the processing raises
    """
    try:
        # Lower-cased extension without the leading dot.
        fileExtension = os.path.splitext(fileName)[1].lower()[1:]

        lookedUpMimeType = self.serviceBase.getMimeType(fileName)
        if fileMetadata:
            mimeType = fileMetadata.get("mimeType", lookedUpMimeType)
        else:
            mimeType = lookedUpMimeType

        if mimeType.startswith("image/"):
            return self._processImageFile(fileContent, fileName, fileExtension, mimeType, fileMetadata)
        if mimeType == "application/pdf":
            return self._processPdfFile(fileContent, fileName, fileMetadata)
        if mimeType == "text/csv":
            return self._processCsvFile(fileContent, fileName, fileMetadata)
        if mimeType == "text/plain":
            return self._processTextFile(fileContent, fileName, fileMetadata)

        # Default binary file handling
        binaryDoc = {
            "name": fileName,
            "ext": fileExtension,
            "mimeType": mimeType,
            "data": base64.b64encode(fileContent).decode('utf-8'),
            "base64Encoded": True,
            "metadata": {"isText": False},
        }
        return [binaryDoc]

    except Exception as e:
        logger.error(f"Error processing file {fileName}: {str(e)}")
        raise FileProcessingError(f"Error processing file: {str(e)}")
|
|
|
|
def _processImageFile(self, fileContent: bytes, fileName: str, fileExtension: str, mimeType: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
|
|
"""Process an image file."""
|
|
try:
|
|
# Create image document
|
|
imageDoc = {
|
|
"name": fileName,
|
|
"ext": fileExtension,
|
|
"mimeType": mimeType,
|
|
"data": base64.b64encode(fileContent).decode('utf-8'),
|
|
"base64Encoded": True,
|
|
"metadata": {
|
|
"isText": False,
|
|
"isImage": True,
|
|
"format": fileExtension
|
|
}
|
|
}
|
|
|
|
# Add image description if available
|
|
if fileMetadata and "description" in fileMetadata:
|
|
imageDoc["metadata"]["description"] = fileMetadata["description"]
|
|
|
|
return [imageDoc]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing image file {fileName}: {str(e)}")
|
|
raise FileProcessingError(f"Error processing image file: {str(e)}")
|
|
|
|
def _processPdfFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
|
|
"""Process a PDF file."""
|
|
try:
|
|
# Create PDF document
|
|
pdfDoc = {
|
|
"name": fileName,
|
|
"ext": "pdf",
|
|
"mimeType": "application/pdf",
|
|
"data": base64.b64encode(fileContent).decode('utf-8'),
|
|
"base64Encoded": True,
|
|
"metadata": {
|
|
"isText": False,
|
|
"isPdf": True
|
|
}
|
|
}
|
|
|
|
return [pdfDoc]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing PDF file {fileName}: {str(e)}")
|
|
raise FileProcessingError(f"Error processing PDF file: {str(e)}")
|
|
|
|
def _processCsvFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
|
|
"""Process a CSV file."""
|
|
try:
|
|
# Try to decode as text first
|
|
try:
|
|
csvContent = fileContent.decode('utf-8')
|
|
base64Encoded = False
|
|
except UnicodeDecodeError:
|
|
# If not valid UTF-8, encode as base64
|
|
csvContent = base64.b64encode(fileContent).decode('utf-8')
|
|
base64Encoded = True
|
|
|
|
# Create CSV document
|
|
csvDoc = {
|
|
"name": fileName,
|
|
"ext": "csv",
|
|
"mimeType": "text/csv",
|
|
"data": csvContent,
|
|
"base64Encoded": base64Encoded,
|
|
"metadata": {
|
|
"isText": True,
|
|
"isCsv": True,
|
|
"format": "csv"
|
|
}
|
|
}
|
|
|
|
return [csvDoc]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing CSV file {fileName}: {str(e)}")
|
|
raise FileProcessingError(f"Error processing CSV file: {str(e)}")
|
|
|
|
def _processTextFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
|
|
"""Process a text file."""
|
|
try:
|
|
# Try to decode as text
|
|
try:
|
|
textContent = fileContent.decode('utf-8')
|
|
base64Encoded = False
|
|
except UnicodeDecodeError:
|
|
# If not valid UTF-8, encode as base64
|
|
textContent = base64.b64encode(fileContent).decode('utf-8')
|
|
base64Encoded = True
|
|
|
|
# Create text document
|
|
textDoc = {
|
|
"name": fileName,
|
|
"ext": "txt",
|
|
"mimeType": "text/plain",
|
|
"data": textContent,
|
|
"base64Encoded": base64Encoded,
|
|
"metadata": {
|
|
"isText": True
|
|
}
|
|
}
|
|
|
|
return [textDoc]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error processing text file {fileName}: {str(e)}")
|
|
raise FileProcessingError(f"Error processing text file: {str(e)}") |