gateway/modules/workflows/methods/methodSharepoint/helpers/documentParsing.py
2025-12-17 10:45:09 +01:00

252 lines
12 KiB
Python

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Document Parsing helper for SharePoint operations.
Handles parsing of document lists and extracting found documents and site information.
"""
import logging
import json
from typing import Dict, Any, List, Optional
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class DocumentParsingHelper:
    """Helper for parsing document lists and extracting document information.

    Both public methods resolve a document list to chat documents, read the
    file data of the FIRST chat document and try to interpret it as a JSON
    result object (with a ``foundDocuments`` or ``listResults`` key).
    """

    # User-facing error messages shared by the parsing methods.
    _NO_DOCUMENTS_ERROR = "No documents found for the provided document list"
    _MISSING_SITE_ERROR = "Site information missing from documentList. Cannot determine target site."

    def __init__(self, methodInstance):
        """
        Initialize document parsing helper.

        Args:
            methodInstance: Instance of MethodSharepoint (for access to services)
        """
        self.method = methodInstance
        self.services = methodInstance.services

    def _resolveChatDocuments(self, documentList: Any) -> Any:
        """Normalize documentList to a DocumentReferenceList and resolve it to chat documents.

        A single string is wrapped in a list, a list is converted with
        ``DocumentReferenceList.from_string_list`` and any other unsupported
        type resolves against an empty reference list.
        """
        # Imported locally, exactly as the original code did.
        from modules.datamodels.datamodelDocref import DocumentReferenceList

        if isinstance(documentList, str):
            documentList = [documentList]

        if isinstance(documentList, DocumentReferenceList):
            docRefList = documentList
        elif isinstance(documentList, list):
            docRefList = DocumentReferenceList.from_string_list(documentList)
        else:
            docRefList = DocumentReferenceList(references=[])

        return self.services.chat.getChatDocumentsFromDocumentList(docRefList)

    async def _findSiteByName(self, siteName: str) -> Optional[Dict[str, Any]]:
        """Run site discovery once and return the first site matching siteName, or None."""
        allSites = await self.method.siteDiscovery.discoverSharePointSites()
        matchingSites = self.method.siteDiscovery.filterSitesByHint(allSites, siteName)
        return matchingSites[0] if matchingSites else None

    @staticmethod
    def _convertListResults(listResults: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert listDocuments ``listResults`` entries into the foundDocuments format.

        Only items whose ``type`` is ``"file"`` are converted. ``siteId`` is
        left as None because the list format does not carry it here.
        """
        converted: List[Dict[str, Any]] = []
        for listResult in listResults:
            for siteResult in listResult.get("siteResults", []):
                for item in siteResult.get("items", []):
                    if item.get("type") != "file":
                        continue
                    converted.append({
                        "id": item.get("id"),
                        "name": item.get("name"),
                        "type": "file",
                        "siteName": item.get("siteName"),
                        "siteId": None,  # Will be determined from site discovery
                        "webUrl": item.get("webUrl"),
                        "fullPath": item.get("webUrl", ""),
                        "parentPath": item.get("parentPath", ""),
                    })
        return converted

    @staticmethod
    def _firstFoundDocument(fileData: Any) -> Optional[Dict[str, Any]]:
        """Return the first ``foundDocuments`` entry of a JSON result, or None.

        None is returned whenever fileData is not such a result: it is not
        decodable text, not valid JSON, not a JSON object, or has no usable
        ``foundDocuments`` list.
        """
        try:
            resultData = json.loads(fileData)
        except ValueError:
            # Covers json.JSONDecodeError (not JSON) and UnicodeDecodeError
            # (binary content such as a PDF); both are ValueError subclasses.
            return None
        if not isinstance(resultData, dict):
            return None
        foundDocuments = resultData.get("foundDocuments")
        if not isinstance(foundDocuments, list) or not foundDocuments:
            return None
        firstDoc = foundDocuments[0]
        return firstDoc if isinstance(firstDoc, dict) else None

    async def parseDocumentListForFoundDocuments(self, documentList: Any) -> tuple[Optional[List[Dict[str, Any]]], Optional[List[Dict[str, Any]]], Optional[str]]:
        """
        Parse documentList to extract foundDocuments and site information.

        Parameters:
            documentList: Document list (can be list, DocumentReferenceList, or string)

        Returns:
            tuple: (foundDocuments, sites, errorMessage)
            - foundDocuments: List of found documents from findDocumentPath result
              (or converted from a listDocuments ``listResults`` payload)
            - sites: Single-element list with a site dictionary (id, displayName, webUrl)
            - errorMessage: Error message if parsing failed, None otherwise

            ``(None, None, None)`` means "nothing to extract, but not an error"
            (no file data, or no found documents in the result).
        """
        try:
            chatDocuments = self._resolveChatDocuments(documentList)
            if not chatDocuments:
                return None, None, self._NO_DOCUMENTS_ERROR

            fileData = self.services.chat.getFileData(chatDocuments[0].fileId)
            if not fileData:
                return None, None, None  # No fileData, but not an error (might be regular file)

            try:
                resultData = json.loads(fileData)
                if not isinstance(resultData, dict):
                    # Valid JSON but not a result object; report it clearly
                    # instead of failing later on a missing .get attribute.
                    return None, None, (
                        "Error processing documentList: expected a JSON object, "
                        f"got {type(resultData).__name__}"
                    )

                foundDocuments = resultData.get("foundDocuments", [])
                convertedFromListResults = False

                # If no foundDocuments, check if it's a listDocuments result (has listResults)
                if not foundDocuments and "listResults" in resultData:
                    logger.info("documentList contains listResults from listDocuments, converting to foundDocuments format")
                    foundDocuments = self._convertListResults(resultData.get("listResults") or [])
                    convertedFromListResults = True
                    logger.info("Converted %d files from listResults format", len(foundDocuments))

                if not foundDocuments:
                    return None, None, None  # No foundDocuments, but not an error

                # The target site is taken from the first found document.
                firstDoc = foundDocuments[0]
                siteName = firstDoc.get("siteName")
                siteId = firstDoc.get("siteId")

                if not siteName:
                    return None, None, self._MISSING_SITE_ERROR

                if not siteId:
                    # Discover exactly once; repeating the same lookup after a
                    # miss cannot produce a different answer.
                    logger.info("Site ID missing, discovering sites to find siteId for '%s'", siteName)
                    site = await self._findSiteByName(siteName)
                    siteId = site.get("id") if site is not None else None
                    if not siteId:
                        return None, None, f"Site '{siteName}' not found. Cannot determine target site."
                    logger.info("Found siteId '%s' for site '%s'", siteId, siteName)

                    if convertedFromListResults:
                        # Stamp the id only on documents of the resolved site so
                        # documents from other sites never get a wrong siteId.
                        for doc in foundDocuments:
                            if doc.get("siteName") == siteName:
                                doc["siteId"] = siteId

                # NOTE: webUrl is the first document's URL, as in the original code.
                sites = [{
                    "id": siteId,
                    "displayName": siteName,
                    "webUrl": firstDoc.get("webUrl", "")
                }]
                logger.info("Using specific site from documentList: %s (ID: %s)", siteName, siteId)
                return foundDocuments, sites, None

            except json.JSONDecodeError as e:
                return None, None, f"Invalid JSON in documentList: {str(e)}"
            except Exception as e:
                return None, None, f"Error processing documentList: {str(e)}"

        except Exception as e:
            logger.exception("Error parsing documentList: %s", e)
            return None, None, f"Error parsing documentList: {str(e)}"

    async def parseDocumentListForFolder(self, documentList: Any) -> tuple[Optional[str], Optional[List[Dict[str, Any]]], Optional[List], Optional[str]]:
        """
        Parse documentList to extract folder path, site information, and files to upload.

        Parameters:
            documentList: Document list (can be list, DocumentReferenceList, or string)

        Returns:
            tuple: (folderPath, sites, filesToUpload, errorMessage)
            - folderPath: ``parentPath`` of the first found document (or None)
            - sites: Single-element list with a site dictionary (id, displayName, webUrl), or None
            - filesToUpload: The resolved chat documents (or None on error)
            - errorMessage: Error message if parsing failed, None otherwise

        File data that is not a findDocumentPath-style JSON object (binary
        content, invalid JSON, JSON arrays/scalars) is NOT an error: the chat
        documents are then simply returned as regular files to upload.
        """
        try:
            chatDocuments = self._resolveChatDocuments(documentList)
            if not chatDocuments:
                return None, None, None, self._NO_DOCUMENTS_ERROR

            # For uploadDocument the chat documents themselves are always the
            # files to upload, whatever the first document contains.
            filesToUpload = chatDocuments

            # Check if first document is a findDocumentPath result (has foundDocuments)
            fileData = self.services.chat.getFileData(chatDocuments[0].fileId)
            firstDoc = self._firstFoundDocument(fileData) if fileData else None
            if firstDoc is None:
                return None, None, filesToUpload, None

            # An empty parentPath is reported as "no folder" (None).
            folderPath = firstDoc.get("parentPath") or None

            sites = None
            siteName = firstDoc.get("siteName")
            siteId = firstDoc.get("siteId")
            if siteName and siteId:
                sites = [{
                    "id": siteId,
                    "displayName": siteName,
                    "webUrl": firstDoc.get("webUrl", "")
                }]
            elif siteName:
                # Discover sites to find siteId; an unknown site leaves sites as None.
                site = await self._findSiteByName(siteName)
                if site is not None:
                    sites = [{
                        "id": site.get("id"),
                        "displayName": siteName,
                        "webUrl": site.get("webUrl", "")
                    }]

            return folderPath, sites, filesToUpload, None

        except Exception as e:
            logger.exception("Error parsing documentList for folder: %s", e)
            return None, None, None, f"Error parsing documentList for folder: {str(e)}"