778 lines
No EOL
29 KiB
Python
778 lines
No EOL
29 KiB
Python
"""
|
|
Module for extracting content from various file formats.
|
|
Provides specialized functions for processing text, PDF, Office documents, images, etc.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import io
|
|
from typing import Dict, Any, List, Optional, Union, Tuple
|
|
import base64
|
|
|
|
# Configure logger
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Optional imports - only loaded when needed
|
|
pdf_extractor_loaded = False
|
|
office_extractor_loaded = False
|
|
image_processor_loaded = False
|
|
|
|
def get_document_contents(file_metadata: Dict[str, Any], file_content: bytes) -> List[Dict[str, Any]]:
    """
    Main function for extracting content from a file based on its MIME type.
    Delegates to specialized extraction functions.

    Args:
        file_metadata: File metadata (Name, MIME type, etc.)
        file_content: Binary data of the file

    Returns:
        List of Document-Content objects with metadata and is_text flag.
        Every item whose "data" is bytes is returned base64-encoded (as str)
        with metadata["base64_encoded"] = True. This also holds for the single
        fallback item that is returned when extraction raises.
    """
    try:
        # A missing or empty MIME type is treated as opaque binary data
        mime_type = file_metadata.get("mime_type") or "application/octet-stream"
        file_name = file_metadata.get("name", "unknown")

        logger.info(f"Extracting content from file '{file_name}' (MIME type: {mime_type})")

        # Extract content based on MIME type
        contents = []

        # CSV Format - has to be tested before the generic "text/" prefix,
        # otherwise "text/csv" is swallowed by the plain-text branch
        if mime_type == "text/csv":
            contents.extend(extract_csv_content(file_name, file_content))

        # Text-based formats
        elif mime_type.startswith("text/") or mime_type in (
            "application/json",
            "application/xml",
            "application/javascript",
            "application/x-python",
        ):
            contents.extend(extract_text_content(file_name, file_content, mime_type))

        # Images
        elif mime_type.startswith("image/"):
            contents.extend(extract_image_content(file_name, file_content, mime_type))

        # PDF Documents
        elif mime_type == "application/pdf":
            contents.extend(extract_pdf_content(file_name, file_content))

        # Word Documents
        elif mime_type in (
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/msword",
        ):
            contents.extend(extract_word_content(file_name, file_content, mime_type))

        # Excel Documents
        elif mime_type in (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "application/vnd.ms-excel",
        ):
            contents.extend(extract_excel_content(file_name, file_content, mime_type))

        # PowerPoint Documents
        elif mime_type in (
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.ms-powerpoint",
        ):
            contents.extend(extract_powerpoint_content(file_name, file_content, mime_type))

        # Binary data as fallback for unknown formats
        else:
            contents.extend(extract_binary_content(file_name, file_content, mime_type))

        # Fallback when no content could be extracted
        if not contents:
            logger.warning(f"No content extracted from file '{file_name}', using binary fallback")
            suffix = os.path.splitext(file_name)[1]
            contents.append({
                "sequence_nr": 1,
                "name": '1_undefined',
                "ext": suffix[1:] if suffix else "bin",
                "content_type": mime_type,
                "data": file_content,
                "metadata": {
                    "is_text": False
                }
            })

        # Add generic attributes for all documents
        _encode_binary_data(contents)

        logger.info(f"Successfully extracted {len(contents)} content items from file '{file_name}'")
        return contents

    except Exception as e:
        logger.error(f"Error during content extraction: {str(e)}")
        # Fallback on error - return original data, encoded the same way as
        # the regular path so callers always receive the same data shape
        suffix = os.path.splitext(file_metadata.get("name", ""))[1]
        return _encode_binary_data([{
            "sequence_nr": 1,
            "name": file_metadata.get("name", "unknown"),
            "ext": suffix[1:] if suffix else "bin",
            "content_type": file_metadata.get("mime_type") or "application/octet-stream",
            "data": file_content,
            "metadata": {
                "is_text": False
            }
        }])


def _encode_binary_data(contents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Base64-encodes bytes payloads in place, flags them in metadata and returns the same list."""
    for content in contents:
        if isinstance(content.get("data"), bytes):
            content["data"] = base64.b64encode(content["data"]).decode('utf-8')
            # Add base64 flag
            content.setdefault("metadata", {})["base64_encoded"] = True
    return contents
|
|
|
|
|
|
def _load_pdf_extractor():
    """Imports PyPDF2 and PyMuPDF (fitz) on first use and records success in pdf_extractor_loaded."""
    global pdf_extractor_loaded, PyPDF2, fitz

    # Nothing to do once a previous call has succeeded
    if pdf_extractor_loaded:
        return

    try:
        import PyPDF2
        import fitz  # PyMuPDF for more extensive PDF processing
    except ImportError as e:
        # The flag stays False, so a later call tries the import again
        logger.warning(f"PDF extraction libraries could not be loaded: {e}")
    else:
        pdf_extractor_loaded = True
        logger.info("PDF extraction libraries successfully loaded")
|
|
|
|
def _load_office_extractor():
    """Imports python-docx and openpyxl on first use and records success in office_extractor_loaded."""
    global office_extractor_loaded, docx, openpyxl

    # Nothing to do once a previous call has succeeded
    if office_extractor_loaded:
        return

    try:
        import docx  # python-docx for Word documents
        import openpyxl  # for Excel files
    except ImportError as e:
        # The flag stays False, so a later call tries the import again
        logger.warning(f"Office extraction libraries could not be loaded: {e}")
    else:
        office_extractor_loaded = True
        logger.info("Office extraction libraries successfully loaded")
|
|
|
|
def _load_image_processor():
    """Imports PIL.Image on first use and records success in image_processor_loaded."""
    global image_processor_loaded, PIL, Image

    # Nothing to do once a previous call has succeeded
    if image_processor_loaded:
        return

    try:
        from PIL import Image
    except ImportError as e:
        # The flag stays False, so a later call tries the import again
        logger.warning(f"Image processing libraries could not be loaded: {e}")
    else:
        image_processor_loaded = True
        logger.info("Image processing libraries successfully loaded")
|
|
|
|
def extract_text_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts text from text files.

    Decoding is attempted as UTF-8 first (a leading byte-order mark is
    stripped), then as cp1252 and finally as latin-1. cp1252 is tried before
    latin-1 because latin-1 accepts every byte sequence and would otherwise
    hide all later candidates.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List with one Text-Content object with metadata.is_text = True.
        metadata.encoding is only present when a fallback encoding was used.
        If no encoding works, the raw bytes are returned with is_text = False.
    """
    # Keep original file extension
    suffix = os.path.splitext(file_name)[1]
    file_extension = suffix[1:] if suffix else "txt"

    # Extract text content
    try:
        # "utf-8-sig" behaves like UTF-8 but drops a leading BOM if present
        text_content = file_content.decode('utf-8-sig')
    except UnicodeDecodeError:
        logger.warning(f"Could not decode text from file '{file_name}' as UTF-8, trying alternative encodings")
    else:
        return [{
            "sequence_nr": 1,
            "name": "1_text",  # Simplified naming
            "ext": file_extension,
            "content_type": "text",
            "data": text_content,
            "metadata": {
                "is_text": True
            }
        }]

    # Try alternative encodings, most specific first
    for encoding in ('cp1252', 'latin-1'):
        try:
            text_content = file_content.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(f"Text successfully decoded with encoding {encoding}")
        return [{
            "sequence_nr": 1,
            "name": "1_text",  # Simplified naming
            "ext": file_extension,
            "content_type": "text",
            "data": text_content,
            "metadata": {
                "is_text": True,
                "encoding": encoding
            }
        }]

    # Fallback to binary data if no encoding works (defensive: latin-1 maps
    # every byte, so this is not expected to be reached)
    logger.warning(f"Could not decode text, using binary data")
    return [{
        "sequence_nr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False
        }
    }]
|
|
|
|
def extract_csv_content(file_name: str, file_content: bytes) -> List[Dict[str, Any]]:
    """
    Extracts content from CSV files.

    Decoding is attempted as UTF-8 first (a leading byte-order mark is
    stripped so it does not end up in the first header cell), then as cp1252
    and finally as latin-1. cp1252 is tried before latin-1 because latin-1
    accepts every byte sequence and would otherwise hide all later candidates.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file

    Returns:
        List with one CSV-Content object with metadata.is_text = True.
        metadata.encoding is only present when a fallback encoding was used.
        If no encoding works, the raw bytes are returned with is_text = False.
    """
    # Extract text content
    try:
        # "utf-8-sig" behaves like UTF-8 but drops a leading BOM if present
        csv_content = file_content.decode('utf-8-sig')
    except UnicodeDecodeError:
        logger.warning(f"Could not decode CSV from file '{file_name}' as UTF-8, trying alternative encodings")
    else:
        return [{
            "sequence_nr": 1,
            "name": "1_csv",  # Simplified naming
            "ext": "csv",
            "content_type": "csv",
            "data": csv_content,
            "metadata": {
                "is_text": True,
                "format": "csv"
            }
        }]

    # Try alternative encodings for CSV, most specific first
    for encoding in ('cp1252', 'latin-1'):
        try:
            csv_content = file_content.decode(encoding)
        except UnicodeDecodeError:
            continue
        logger.info(f"CSV successfully decoded with encoding {encoding}")
        return [{
            "sequence_nr": 1,
            "name": "1_csv",  # Simplified naming
            "ext": "csv",
            "content_type": "csv",
            "data": csv_content,
            "metadata": {
                "is_text": True,
                "encoding": encoding,
                "format": "csv"
            }
        }]

    # Fallback to binary data (defensive: latin-1 maps every byte, so this is
    # not expected to be reached)
    return [{
        "sequence_nr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": "csv",
        "content_type": "text/csv",
        "data": file_content,
        "metadata": {
            "is_text": False
        }
    }]
|
|
|
|
def extract_image_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts content from image files and optionally generates metadata descriptions.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List of Image-Content objects. The first item always carries the
        original bytes with metadata.is_text = False. When the image could be
        opened (or opening it failed inside the image library), a second text
        item with a short description is appended.
    """
    # Extract file extension from MIME type or filename.
    # Parameters ("image/png; q=1") and structured-syntax suffixes
    # ("image/svg+xml") are not part of the extension, so they are removed.
    subtype = mime_type.split(';', 1)[0].strip().split('/')[-1].lower()
    subtype = subtype.split('+', 1)[0]
    if not subtype:
        # MIME type carries no usable subtype - fall back to the filename
        subtype = os.path.splitext(file_name)[1][1:].lower() or "bin"
    file_extension = "jpg" if subtype == "jpeg" else subtype

    # If possible, analyze image and extract metadata
    image_metadata = {
        "is_text": False,
        "format": "image"
    }
    image_description = None

    try:
        _load_image_processor()
        if image_processor_loaded and file_content:
            with io.BytesIO(file_content) as img_stream:
                try:
                    img = Image.open(img_stream)
                    # Check if the image was actually loaded
                    img.verify()
                    # The image object is not used again after verify();
                    # rewind the stream and open a fresh one
                    img_stream.seek(0)
                    img = Image.open(img_stream)
                    image_metadata.update({
                        "format": img.format,
                        "mode": img.mode,
                        "width": img.width,
                        "height": img.height
                    })
                    # Extract EXIF data if available (only some image
                    # classes provide _getexif, hence the hasattr guard)
                    if hasattr(img, '_getexif') and callable(img._getexif):
                        exif = img._getexif()
                        if exif:
                            image_metadata["exif"] = {
                                f"tag_{tag_id}": str(value) for tag_id, value in exif.items()
                            }

                    # Generate image description
                    image_description = f"Image ({img.width}x{img.height}, {img.format}, {img.mode})"
                except Exception as inner_e:
                    logger.warning(f"Error processing image: {str(inner_e)}")
                    image_metadata["error"] = str(inner_e)
                    image_description = f"Image (unable to process: {str(inner_e)})"
    except Exception as e:
        # Best effort only: the raw image is still returned below
        logger.warning(f"Could not extract image metadata: {str(e)}")
        image_metadata["error"] = str(e)

    # Return image content
    contents = [{
        "sequence_nr": 1,
        "name": "1_image",  # Simplified naming
        "ext": file_extension,
        "content_type": "image",
        "data": file_content,
        "metadata": image_metadata
    }]

    # If image description available, add as additional text content
    if image_description:
        contents.append({
            "sequence_nr": 2,
            "name": "2_text_image_info",  # Simplified naming with label
            "ext": "txt",
            "content_type": "text",
            "data": image_description,
            "metadata": {
                "is_text": True,
                "image_description": True
            }
        })

    return contents
|
|
|
|
def extract_pdf_content(file_name: str, file_content: bytes) -> List[Dict[str, Any]]:
    """
    Extracts text and images from PDF files.

    Text is read with PyPDF2 and returned as one text item in which every
    page is introduced by a "--- Page N ---" line. Embedded images are read
    with PyMuPDF (fitz) and returned as one item per image.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file

    Returns:
        List of PDF-Content objects (text and images) with metadata.is_text flag.
        If the libraries are unavailable, extraction fails, or nothing could
        be extracted, the list holds the original PDF as a single binary item.
    """
    contents = []

    try:
        # Load PDF extraction libraries
        _load_pdf_extractor()
        if not pdf_extractor_loaded:
            logger.warning("PDF extraction not possible: Libraries not available")
            # Add original file as binary content
            contents.append({
                "sequence_nr": 1,
                "name": "1_pdf",  # Simplified naming
                "ext": "pdf",
                "content_type": "application/pdf",
                "data": file_content,
                "metadata": {
                    "is_text": False,
                    "format": "pdf"
                }
            })
            return contents

        # Extract text with PyPDF2
        page_chunks = []
        pdf_metadata = {}
        with io.BytesIO(file_content) as pdf_stream:
            pdf_reader = PyPDF2.PdfReader(pdf_stream)

            # Extract metadata (keys lose their leading "/" if they have one)
            for key, value in (pdf_reader.metadata or {}).items():
                pdf_metadata[key[1:] if key.startswith('/') else key] = value

            # Extract text from all pages
            page_count = len(pdf_reader.pages)
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(f"--- Page {page_num} ---\n{page_text}\n\n")

        extracted_text = "".join(page_chunks)

        # If text was found, add as separate content
        if extracted_text.strip():
            contents.append({
                "sequence_nr": len(contents) + 1,
                "name": f"{len(contents) + 1}_text",  # Simplified naming
                "ext": "txt",
                "content_type": "text",
                "data": extracted_text,
                "metadata": {
                    "is_text": True,
                    "source": "pdf",
                    "pages": page_count,
                    "pdf_metadata": pdf_metadata
                }
            })

        # Extract images with PyMuPDF (fitz)
        try:
            with io.BytesIO(file_content) as pdf_stream:
                doc = fitz.open(stream=pdf_stream, filetype="pdf")
                try:
                    for page_num in range(len(doc)):
                        page = doc[page_num]

                        for img_index, img_info in enumerate(page.get_images(full=True)):
                            try:
                                # First entry of the image record is its xref
                                xref = img_info[0]
                                base_image = doc.extract_image(xref)
                                image_bytes = base_image["image"]
                                image_ext = base_image["ext"]

                                # Add image as content
                                contents.append({
                                    "sequence_nr": len(contents) + 1,
                                    "name": f"{len(contents) + 1}_image_page{page_num+1}_{img_index+1}",  # Simplified naming with label
                                    "ext": image_ext,
                                    "content_type": f"image/{image_ext}",
                                    "data": image_bytes,
                                    "metadata": {
                                        "is_text": False,
                                        "source": "pdf",
                                        "page": page_num + 1,
                                        "index": img_index
                                    }
                                })
                            except Exception as img_e:
                                # A single broken image must not stop the others
                                logger.warning(f"Error extracting image {img_index} on page {page_num + 1}: {str(img_e)}")
                finally:
                    # Close document even when iterating the pages fails
                    doc.close()

        except Exception as img_extract_e:
            logger.warning(f"Error extracting images from PDF: {str(img_extract_e)}")

    except Exception as e:
        logger.error(f"Error in PDF extraction: {str(e)}")

    # If no content was extracted, add the original PDF
    if not contents:
        contents.append({
            "sequence_nr": 1,
            "name": "1_pdf",  # Simplified naming
            "ext": "pdf",
            "content_type": "application/pdf",
            "data": file_content,
            "metadata": {
                "is_text": False,
                "format": "pdf"
            }
        })

    return contents
|
|
|
|
def extract_word_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts text from Word documents.

    Only the DOCX format is parsed: paragraph texts come first, followed by
    one line per table row with the cell texts joined by " | ".

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List of Word-Content objects with metadata.is_text flag: either one
        text item, or the original document as a single binary item when
        nothing could be extracted.
    """
    is_docx = mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    # Determine file extension
    file_extension = "docx" if is_docx else "doc"

    # The untouched document, used whenever no text can be produced
    original_document = {
        "sequence_nr": 1,
        "name": "1_word",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False,
            "format": "word"
        }
    }

    contents = []

    try:
        # Load Office extraction libraries
        _load_office_extractor()
        if not office_extractor_loaded:
            logger.warning("Word extraction not possible: Libraries not available")
            return [original_document]

        if not is_docx:
            # Only supports DOCX (newer format)
            logger.warning(f"Extraction from old Word format (DOC) not supported")
        else:
            with io.BytesIO(file_content) as docx_stream:
                doc = docx.Document(docx_stream)

                # Paragraph texts first ...
                text_parts = [paragraph.text for paragraph in doc.paragraphs]
                # ... then every table row rendered as a single line
                text_parts.extend(
                    " | ".join(cell.text for cell in row.cells)
                    for table in doc.tables
                    for row in table.rows
                )
                extracted_text = "\n\n".join(text_parts)

                # Add extracted text as content
                if extracted_text.strip():
                    contents.append({
                        "sequence_nr": 1,
                        "name": "1_text",  # Simplified naming
                        "ext": "txt",
                        "content_type": "text",
                        "data": extracted_text,
                        "metadata": {
                            "is_text": True,
                            "source": "docx",
                            "paragraph_count": len(doc.paragraphs),
                            "table_count": len(doc.tables)
                        }
                    })

    except Exception as e:
        logger.error(f"Error in Word extraction: {str(e)}")

    # If no content was extracted, add the original document
    if not contents:
        contents.append(original_document)

    return contents
|
|
|
|
def extract_excel_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Extracts table data from Excel files.

    Only the XLSX format is parsed. Every worksheet that yields non-blank
    output becomes its own CSV text item; all fields are double-quoted and
    embedded quotes are doubled.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List of Excel-Content objects with metadata.is_text flag: one CSV
        item per worksheet, or the original workbook as a single binary item
        when nothing could be extracted.
    """
    is_xlsx = mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    # Determine file extension
    file_extension = "xlsx" if is_xlsx else "xls"

    # The untouched workbook, used whenever no sheet can be produced
    original_workbook = {
        "sequence_nr": 1,
        "name": "1_excel",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False,
            "format": "excel"
        }
    }

    contents = []

    try:
        # Load Office extraction libraries
        _load_office_extractor()
        if not office_extractor_loaded:
            logger.warning("Excel extraction not possible: Libraries not available")
            return [original_workbook]

        if not is_xlsx:
            # Only supports XLSX (newer format)
            logger.warning(f"Extraction from old Excel format (XLS) not supported")
        else:
            with io.BytesIO(file_content) as xlsx_stream:
                workbook = openpyxl.load_workbook(xlsx_stream, data_only=True)

                # Extract each worksheet as separate CSV content
                for sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]

                    # Format data as CSV: empty cells become "", quotes are doubled
                    rendered_rows = []
                    for row in sheet.iter_rows():
                        fields = [
                            "" if value is None else str(value).replace('"', '""')
                            for value in (cell.value for cell in row)
                        ]
                        rendered_rows.append(','.join('"' + field + '"' for field in fields))

                    csv_content = "\n".join(rendered_rows)
                    if not csv_content.strip():
                        continue

                    # Add as CSV content
                    position = len(contents) + 1
                    sheet_safe_name = sheet_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
                    contents.append({
                        "sequence_nr": position,
                        "name": f"{position}_csv_{sheet_safe_name}",  # Simplified naming with sheet label
                        "ext": "csv",
                        "content_type": "csv",
                        "data": csv_content,
                        "metadata": {
                            "is_text": True,
                            "source": "xlsx",
                            "sheet": sheet_name,
                            "format": "csv"
                        }
                    })

    except Exception as e:
        logger.error(f"Error in Excel extraction: {str(e)}")

    # If no content was extracted, add the original document
    if not contents:
        contents.append(original_workbook)

    return contents
|
|
|
|
def extract_powerpoint_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Wraps a PowerPoint presentation as a single binary content item.

    No slide content is parsed; the original bytes are passed through.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List with one PowerPoint-Content object with metadata.is_text = False
    """
    # Anything other than the OpenXML presentation type is labelled "ppt"
    if mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        file_extension = "pptx"
    else:
        file_extension = "ppt"

    presentation = {
        "sequence_nr": 1,
        "name": "1_powerpoint",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False,
            "format": "powerpoint"
        },
    }
    return [presentation]
|
|
|
|
def extract_binary_content(file_name: str, file_content: bytes, mime_type: str) -> List[Dict[str, Any]]:
    """
    Wraps a file as a single binary content item when no specific extraction applies.

    Args:
        file_name: Name of the file
        file_content: Binary data of the file
        mime_type: MIME type of the file

    Returns:
        List with a binary Content object with metadata.is_text = False
    """
    # Use the filename's extension without the dot; "bin" if it has none
    suffix = os.path.splitext(file_name)[1]
    file_extension = suffix[1:] if suffix else "bin"

    binary_item = {
        "sequence_nr": 1,
        "name": "1_binary",  # Simplified naming
        "ext": file_extension,
        "content_type": mime_type,
        "data": file_content,
        "metadata": {
            "is_text": False,
            "format": "binary"
        },
    }
    return [binary_item]