gateway/modules/routes/routeDataFiles.py
2026-03-15 23:38:21 +01:00

554 lines
21 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
from fastapi import APIRouter, HTTPException, Depends, File, UploadFile, Form, Path, Request, status, Query, Response, Body
from fastapi.responses import JSONResponse
from typing import List, Dict, Any, Optional
import logging
import json
# Import auth module
from modules.auth import limiter, getCurrentUser, getRequestContext, RequestContext
# Import interfaces
import modules.interfaces.interfaceDbManagement as interfaceDbManagement
from modules.datamodels.datamodelFiles import FileItem, FilePreview
from modules.shared.attributeUtils import getModelAttributeDefinitions
from modules.datamodels.datamodelUam import User
from modules.datamodels.datamodelPagination import PaginationParams, PaginatedResponse, PaginationMetadata, normalize_pagination_dict
# Configure logger
logger = logging.getLogger(__name__)
async def _autoIndexFile(fileId: str, fileName: str, mimeType: str, user):
    """Background task: pre-scan + extraction + knowledge indexing.

    Step 1: Structure Pre-Scan (AI-free) -> FileContentIndex (persisted)
    Step 2: Content extraction via runExtraction -> ContentParts
    Step 3: KnowledgeService.indexFile -> chunking + embedding -> Knowledge Store

    Best-effort: any failure is logged and the file status is reset to
    "active" so a failed indexing run never blocks the upload itself.

    Args:
        fileId: ID of the stored file to index.
        fileName: Original file name (used for logging and context refs).
        mimeType: MIME type of the stored content.
        user: User object (with an ``id`` attribute) or a plain user-id string.
    """
    # Accept either a User object or a plain user-id string
    userId = user.id if hasattr(user, "id") else str(user)
    try:
        mgmtInterface = interfaceDbManagement.getInterface(user)
        # Mark the file as busy while the pipeline runs
        mgmtInterface.updateFile(fileId, {"status": "processing"})
        rawBytes = mgmtInterface.getFileData(fileId)
        if not rawBytes:
            # Nothing to index; restore status and bail out
            logger.warning(f"Auto-index: no file data for {fileId}, skipping")
            mgmtInterface.updateFile(fileId, {"status": "active"})
            return
        logger.info(f"Auto-index starting for {fileName} ({len(rawBytes)} bytes, {mimeType})")
        # Step 1: Structure Pre-Scan (AI-free)
        # NOTE(review): service imports are function-local — presumably to
        # avoid import cycles or heavy startup cost; confirm before moving.
        from modules.serviceCenter.services.serviceKnowledge.subPreScan import preScanDocument
        contentIndex = await preScanDocument(
            fileData=rawBytes,
            mimeType=mimeType,
            fileId=fileId,
            fileName=fileName,
            userId=userId,
        )
        logger.info(
            f"Pre-scan complete for {fileName}: "
            f"{contentIndex.totalObjects} objects"
        )
        # Persist FileContentIndex immediately (before extraction, so the
        # pre-scan result survives even if a later step fails)
        from modules.interfaces.interfaceDbKnowledge import getInterface as getKnowledgeInterface
        knowledgeDb = getKnowledgeInterface()
        knowledgeDb.upsertFileContentIndex(contentIndex)
        # Step 2: Content extraction (AI-free, produces ContentParts)
        from modules.serviceCenter.services.serviceExtraction.subRegistry import ExtractorRegistry, ChunkerRegistry
        from modules.serviceCenter.services.serviceExtraction.subPipeline import runExtraction
        from modules.datamodels.datamodelExtraction import ExtractionOptions
        extractorRegistry = ExtractorRegistry()
        chunkerRegistry = ChunkerRegistry()
        options = ExtractionOptions()
        extracted = runExtraction(
            extractorRegistry, chunkerRegistry,
            rawBytes, fileName, mimeType, options,
        )
        # Map extracted parts to the contentObjects shape consumed by indexFile
        contentObjects = []
        for part in extracted.parts:
            # Collapse part.typeGroup into the coarse contentType used downstream
            contentType = "text"
            if part.typeGroup == "image":
                contentType = "image"
            elif part.typeGroup in ("binary", "container"):
                contentType = "other"
            # Skip empty / whitespace-only parts
            if not part.data or not part.data.strip():
                continue
            contentObjects.append({
                "contentObjectId": part.id,
                "contentType": contentType,
                "data": part.data,
                "contextRef": {
                    "containerPath": fileName,
                    "location": part.label or "file",
                    **(part.metadata or {}),
                },
            })
        logger.info(f"Extracted {len(contentObjects)} content objects from {fileName}")
        if not contentObjects:
            # Nothing extractable: mark the knowledge record indexed
            # (structure only) and restore the file status
            knowledgeDb.updateFileStatus(fileId, "indexed")
            mgmtInterface.updateFile(fileId, {"status": "active"})
            return
        # Step 3: Knowledge indexing (chunking + embedding)
        from modules.serviceCenter import getService
        from modules.serviceCenter.context import ServiceCenterContext
        # NOTE(review): mandate/feature-instance are blank here — background
        # indexing runs without a request context; confirm this scoping is
        # intended for the knowledge store.
        ctx = ServiceCenterContext(user=user, mandate_id="", feature_instance_id="")
        knowledgeService = getService("knowledge", ctx)
        await knowledgeService.indexFile(
            fileId=fileId,
            fileName=fileName,
            mimeType=mimeType,
            userId=userId,
            contentObjects=contentObjects,
            structure=contentIndex.structure,
        )
        mgmtInterface.updateFile(fileId, {"status": "active"})
        logger.info(f"Auto-index complete for file {fileId} ({fileName})")
    except Exception as e:
        logger.error(f"Auto-index failed for file {fileId}: {e}", exc_info=True)
        # Best-effort status reset so the file is never stuck in "processing"
        try:
            errMgmt = interfaceDbManagement.getInterface(user)
            errMgmt.updateFile(fileId, {"status": "active"})
        except Exception:
            pass
# Model attributes for FileItem
# NOTE(review): fileAttributes is not referenced anywhere in this chunk —
# presumably consumed further down the file or by an importer; verify.
fileAttributes = getModelAttributeDefinitions(FileItem)
# Create router for file endpoints; common error responses are declared
# once here so each endpoint inherits them in the OpenAPI schema
router = APIRouter(
    prefix="/api/files",
    tags=["Manage Files"],
    responses={
        404: {"description": "Not found"},
        400: {"description": "Bad request"},
        401: {"description": "Unauthorized"},
        403: {"description": "Forbidden"},
        500: {"description": "Internal server error"}
    }
)
@router.get("/list", response_model=PaginatedResponse[FileItem])
@limiter.limit("30/minute")
def get_files(
    request: Request,
    pagination: Optional[str] = Query(None, description="JSON-encoded PaginationParams object"),
    currentUser: User = Depends(getCurrentUser),
    context: RequestContext = Depends(getRequestContext)
) -> PaginatedResponse[FileItem]:
    """
    List files, optionally paginated, sorted, and filtered.

    Query Parameters:
    - pagination: JSON-encoded PaginationParams object; omit for no pagination
    Examples:
    - GET /api/files/list (no pagination - returns all items)
    - GET /api/files/list?pagination={"page":1,"pageSize":10,"sort":[]}
    - GET /api/files/list?pagination={"page":2,"pageSize":20,"sort":[{"field":"fileName","direction":"asc"}]}
    """
    try:
        # Decode the optional pagination query parameter into PaginationParams
        parsedParams = None
        if pagination:
            try:
                rawDict = json.loads(pagination)
                if rawDict:
                    # normalize_pagination_dict handles the top-level "search" field
                    parsedParams = PaginationParams(**normalize_pagination_dict(rawDict))
            except (json.JSONDecodeError, ValueError) as e:
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid pagination parameter: {str(e)}"
                )
        dbInterface = interfaceDbManagement.getInterface(
            currentUser,
            mandateId=str(context.mandateId) if context.mandateId else None,
            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
        )
        result = dbInterface.getAllFiles(pagination=parsedParams)
        if parsedParams is None:
            # Unpaginated call: the interface returned a plain List[FileItem]
            return PaginatedResponse(items=result, pagination=None)
        # Paginated call: the interface returned a PaginatedResult
        meta = PaginationMetadata(
            currentPage=parsedParams.page,
            pageSize=parsedParams.pageSize,
            totalItems=result.totalItems,
            totalPages=result.totalPages,
            sort=parsedParams.sort,
            filters=parsedParams.filters
        )
        return PaginatedResponse(items=result.items, pagination=meta)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting files: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get files: {str(e)}"
        )
@router.post("/upload", status_code=status.HTTP_201_CREATED)
@limiter.limit("10/minute")
async def upload_file(
    request: Request,
    file: UploadFile = File(...),
    workflowId: Optional[str] = Form(None),
    currentUser: User = Depends(getCurrentUser)
) -> JSONResponse:
    """Upload a file.

    Enforces the configured size limit, deduplicates via the management
    interface, and triggers a non-blocking background auto-index run for
    new files (and for duplicates whose original was never indexed).

    Returns a JSON payload with stored metadata and duplicate information.
    Raises 413 when the file exceeds the configured maximum size.
    """
    # BUGFIX: docstring moved above this statement — previously the first
    # statement preceded the string literal, so the function had no docstring.
    # Add fileName property to UploadFile for consistency with backend model
    file.fileName = file.filename
    try:
        managementInterface = interfaceDbManagement.getInterface(currentUser)
        # Read file
        fileContent = await file.read()
        # Check size limits (configured in MB)
        maxSize = int(interfaceDbManagement.APP_CONFIG.get("File_Management_MAX_UPLOAD_SIZE_MB")) * 1024 * 1024  # in bytes
        if len(fileContent) > maxSize:
            raise HTTPException(
                status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                detail=f"File too large. Maximum size: {interfaceDbManagement.APP_CONFIG.get('File_Management_MAX_UPLOAD_SIZE_MB')}MB"
            )
        # Save file via LucyDOM interface in the database
        fileItem, duplicateType = managementInterface.saveUploadedFile(fileContent, file.filename)
        # Determine response message based on duplicate type
        if duplicateType == "exact_duplicate":
            message = f"File '{file.filename}' already exists with identical content. Reusing existing file."
        elif duplicateType == "name_conflict":
            message = f"File '{file.filename}' already exists with different content. Uploaded as '{fileItem.fileName}'."
        else:  # new_file
            message = "File uploaded successfully"
        # Convert FileItem to dictionary for JSON response
        fileMeta = fileItem.model_dump()
        # If workflowId is provided, include it in the response (not stored in FileItem model)
        if workflowId:
            fileMeta["workflowId"] = workflowId
        # Trigger background auto-index pipeline (non-blocking).
        # Also runs for duplicates in case the original was never successfully indexed.
        shouldIndex = duplicateType == "new_file"
        if not shouldIndex:
            try:
                from modules.interfaces.interfaceDbKnowledge import getInterface as _getKnowledgeInterface
                _kDb = _getKnowledgeInterface()
                _existingIndex = _kDb.getFileContentIndex(fileItem.id)
                if not _existingIndex:
                    shouldIndex = True
                    logger.info(f"Re-triggering auto-index for duplicate {fileItem.id} (not yet indexed)")
            except Exception:
                # Best-effort lookup: if the knowledge store is unreachable,
                # err on the side of (re-)indexing.
                shouldIndex = True
        if shouldIndex:
            try:
                import asyncio
                # Fire-and-forget; _autoIndexFile handles its own errors
                asyncio.ensure_future(_autoIndexFile(
                    fileId=fileItem.id,
                    fileName=fileItem.fileName,
                    mimeType=fileItem.mimeType,
                    user=currentUser,
                ))
            except Exception as indexErr:
                logger.warning(f"Auto-index trigger failed (non-blocking): {indexErr}")
        # Response with duplicate information
        return JSONResponse({
            "message": message,
            "file": fileMeta,
            "duplicateType": duplicateType,
            "originalFileName": file.filename,
            "storedFileName": fileItem.fileName,
            "isDuplicate": duplicateType != "new_file"
        })
    except HTTPException:
        # BUGFIX: re-raise intact — previously the 413 raised above fell into
        # the generic Exception handler below and came back as a 500.
        raise
    except interfaceDbManagement.FileStorageError as e:
        logger.error(f"Error during file upload (storage): {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Error during file upload: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error during file upload: {str(e)}"
        )
@router.get("/{fileId}", response_model=FileItem)
@limiter.limit("30/minute")
def get_file(
    request: Request,
    fileId: str = Path(..., description="ID of the file"),
    currentUser: User = Depends(getCurrentUser)
) -> FileItem:
    """Get a file.

    Returns the FileItem metadata. Raises 404 when the record does not
    exist, 403 on missing permission, and 500 on storage errors.
    """
    try:
        managementInterface = interfaceDbManagement.getInterface(currentUser)
        # Get file via LucyDOM interface from the database
        fileData = managementInterface.getFile(fileId)
        if not fileData:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"File with ID {fileId} not found"
            )
        return fileData
    except HTTPException:
        # BUGFIX: re-raise intact — previously the 404 raised above fell into
        # the generic Exception handler below and came back as a 500.
        raise
    except interfaceDbManagement.FileNotFoundError as e:
        logger.warning(f"File not found: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=str(e)
        )
    except interfaceDbManagement.FilePermissionError as e:
        logger.warning(f"No permission for file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=str(e)
        )
    except interfaceDbManagement.FileError as e:
        logger.error(f"Error retrieving file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Unexpected error retrieving file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error retrieving file: {str(e)}"
        )
@router.put("/{fileId}", response_model=FileItem)
@limiter.limit("10/minute")
def update_file(
    request: Request,
    fileId: str = Path(..., description="ID of the file to update"),
    file_info: Dict[str, Any] = Body(...),
    currentUser: User = Depends(getCurrentUser)
) -> FileItem:
    """Update file info"""
    try:
        dbInterface = interfaceDbManagement.getInterface(currentUser)
        # Guard: the file must exist
        existing = dbInterface.getFile(fileId)
        if not existing:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"File with ID {fileId} not found"
            )
        # Guard: caller needs RBAC "update" permission on this file
        if not dbInterface.checkRbacPermission(FileItem, "update", fileId):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Not authorized to update this file"
            )
        # Apply the update; a falsy result means the write failed
        if not dbInterface.updateFile(fileId, file_info):
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to update file"
            )
        # Re-read so the response reflects the stored state
        return dbInterface.getFile(fileId)
    except HTTPException as he:
        raise he
    except Exception as e:
        logger.error(f"Error updating file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
@router.delete("/{fileId}", response_model=Dict[str, Any])
@limiter.limit("10/minute")
def delete_file(
    request: Request,
    fileId: str = Path(..., description="ID of the file to delete"),
    currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
    """Delete a file.

    Raises 404 when the file does not exist, 403 on missing permission,
    and 500 when the deletion fails.
    """
    # CONSISTENCY FIX: this was the only endpoint with no error handling —
    # interface exceptions escaped unlogged as bare 500s. Now mirrors the
    # handling style of get_file.
    try:
        managementInterface = interfaceDbManagement.getInterface(currentUser)
        # Check if the file exists
        existingFile = managementInterface.getFile(fileId)
        if not existingFile:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"File with ID {fileId} not found"
            )
        success = managementInterface.deleteFile(fileId)
        if not success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Error deleting the file"
            )
        return {"message": f"File with ID {fileId} successfully deleted"}
    except HTTPException:
        raise
    except interfaceDbManagement.FilePermissionError as e:
        logger.warning(f"No permission for file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=str(e)
        )
    except Exception as e:
        logger.error(f"Error deleting file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error deleting file: {str(e)}"
        )
@router.get("/stats", response_model=Dict[str, Any])
@limiter.limit("30/minute")
def get_file_stats(
    request: Request,
    currentUser: User = Depends(getCurrentUser)
) -> Dict[str, Any]:
    """Returns statistics about the stored files.

    NOTE(review): this route is registered AFTER the parameterized
    GET /{fileId} route above, and Starlette matches routes in registration
    order — so GET /api/files/stats is captured by /{fileId} with
    fileId="stats" and likely never reaches this handler. Moving this
    definition above get_file would fix it; confirm against the live app.
    """
    try:
        managementInterface = interfaceDbManagement.getInterface(currentUser)
        # Get all files - metadata only (no pagination -> plain list)
        allFiles = managementInterface.getAllFiles()
        # Calculate statistics
        totalFiles = len(allFiles)
        totalSize = sum(item.fileSize for item in allFiles)
        # Group by top-level MIME category (e.g. "image/png" -> "image")
        fileTypes: Dict[str, int] = {}
        for item in allFiles:
            category = item.mimeType.split("/")[0]
            fileTypes[category] = fileTypes.get(category, 0) + 1
        return {
            "totalFiles": totalFiles,
            "totalSizeBytes": totalSize,
            "fileTypes": fileTypes
        }
    except Exception as e:
        logger.error(f"Error retrieving file statistics: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error retrieving file statistics: {str(e)}"
        )
@router.get("/{fileId}/download")
@limiter.limit("30/minute")
def download_file(
    request: Request,
    fileId: str = Path(..., description="ID of the file to download"),
    currentUser: User = Depends(getCurrentUser),
    context: RequestContext = Depends(getRequestContext)
) -> Response:
    """Download a file. Uses mandate/instance context when present (e.g. from feature pages)."""
    try:
        mandate = str(context.mandateId) if context.mandateId else None
        instance = str(context.featureInstanceId) if context.featureInstanceId else None
        dbInterface = interfaceDbManagement.getInterface(
            currentUser, mandateId=mandate, featureInstanceId=instance
        )
        # Metadata lookup first: 404 when no record exists
        fileRecord = dbInterface.getFile(fileId)
        if not fileRecord:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"File with ID {fileId} not found"
            )
        # Raw bytes: 404 when the record exists but holds no content
        payload = dbInterface.getFileData(fileId)
        if not payload:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"File content not found for ID {fileId}"
            )
        # Percent-encode the filename (RFC 5987 style) so Unicode names
        # survive the Content-Disposition header
        import urllib.parse
        encoded_filename = urllib.parse.quote(fileRecord.fileName)
        headers = {
            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
        }
        return Response(content=payload, media_type=fileRecord.mimeType, headers=headers)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error downloading file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error downloading file: {str(e)}"
        )
@router.get("/{fileId}/preview", response_model=FilePreview)
@limiter.limit("30/minute")
def preview_file(
    request: Request,
    fileId: str = Path(..., description="ID of the file to preview"),
    currentUser: User = Depends(getCurrentUser),
    context: RequestContext = Depends(getRequestContext)
) -> FilePreview:
    """Preview a file's content. Uses mandate/instance context when present."""
    try:
        dbInterface = interfaceDbManagement.getInterface(
            currentUser,
            mandateId=str(context.mandateId) if context.mandateId else None,
            featureInstanceId=str(context.featureInstanceId) if context.featureInstanceId else None
        )
        # getFileContent returns the preview payload, or a falsy value when
        # the file is missing or has no previewable content
        preview = dbInterface.getFileContent(fileId)
        if preview:
            return preview
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"File with ID {fileId} not found or no content available"
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error previewing file: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error previewing file: {str(e)}"
        )