commit
2554f669c7
1 changed files with 148 additions and 47 deletions
|
|
@ -184,12 +184,14 @@ class MethodDocument(MethodBase):
|
||||||
expectedDocumentFormats (list): Target formats
|
expectedDocumentFormats (list): Target formats
|
||||||
originalDocuments (list, optional): Original names
|
originalDocuments (list, optional): Original names
|
||||||
includeMetadata (bool, optional): Include metadata (default: True)
|
includeMetadata (bool, optional): Include metadata (default: True)
|
||||||
|
mergeDocuments (bool, optional): Merge all documents into single output (default: False)
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
document_list = parameters.get("documentList", [])
|
document_list = parameters.get("documentList", [])
|
||||||
expected_document_formats = parameters.get("expectedDocumentFormats", [])
|
expected_document_formats = parameters.get("expectedDocumentFormats", [])
|
||||||
original_documents = parameters.get("originalDocuments", [])
|
original_documents = parameters.get("originalDocuments", [])
|
||||||
include_metadata = parameters.get("includeMetadata", True)
|
include_metadata = parameters.get("includeMetadata", True)
|
||||||
|
merge_documents = parameters.get("mergeDocuments", False)
|
||||||
|
|
||||||
if not document_list:
|
if not document_list:
|
||||||
return ActionResult.isFailure(
|
return ActionResult.isFailure(
|
||||||
|
|
@ -214,9 +216,8 @@ class MethodDocument(MethodBase):
|
||||||
if not original_documents:
|
if not original_documents:
|
||||||
original_documents = [doc.fileName if hasattr(doc, 'fileName') else str(doc.id) for doc in chat_documents]
|
original_documents = [doc.fileName if hasattr(doc, 'fileName') else str(doc.id) for doc in chat_documents]
|
||||||
|
|
||||||
# Process each document individually with its own format conversion
|
# Extract content from all documents first
|
||||||
output_documents = []
|
document_contents = []
|
||||||
|
|
||||||
for i, chat_document in enumerate(chat_documents):
|
for i, chat_document in enumerate(chat_documents):
|
||||||
# Extract content from this document directly - NO AI, just read the data as-is
|
# Extract content from this document directly - NO AI, just read the data as-is
|
||||||
# This ensures we get the original text content for format conversion
|
# This ensures we get the original text content for format conversion
|
||||||
|
|
@ -256,52 +257,75 @@ class MethodDocument(MethodBase):
|
||||||
|
|
||||||
logger.info(f"Extracted content from document {i+1}: {len(content)} characters")
|
logger.info(f"Extracted content from document {i+1}: {len(content)} characters")
|
||||||
|
|
||||||
# Get the expected format for this document (or use default)
|
document_contents.append({
|
||||||
target_format = None
|
"document": chat_document,
|
||||||
if i < len(expected_document_formats):
|
"content": content,
|
||||||
target_format = expected_document_formats[i]
|
"index": i,
|
||||||
elif len(expected_document_formats) > 0:
|
"original_name": original_documents[i] if i < len(original_documents) else f"document_{i+1}"
|
||||||
# If fewer formats than documents, use the last format for remaining documents
|
|
||||||
target_format = expected_document_formats[-1]
|
|
||||||
|
|
||||||
if not target_format:
|
|
||||||
logger.warning(f"No expected format for document {i+1}, skipping")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Use AI to convert format
|
|
||||||
formatted_content = await self._convertContentToFormat(content, target_format)
|
|
||||||
if not formatted_content:
|
|
||||||
logger.warning(f"Failed to format document {i+1}, skipping")
|
|
||||||
continue
|
|
||||||
|
|
||||||
target_extension = target_format.get("extension", ".txt")
|
|
||||||
target_mime_type = target_format.get("mimeType", "text/plain")
|
|
||||||
|
|
||||||
# Create output fileName
|
|
||||||
timestamp = int(get_utc_timestamp())
|
|
||||||
if i < len(original_documents):
|
|
||||||
base_name = original_documents[i].rsplit('.', 1)[0] if '.' in original_documents[i] else original_documents[i]
|
|
||||||
else:
|
|
||||||
base_name = f"document_{i+1}"
|
|
||||||
output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
|
|
||||||
|
|
||||||
# Create result data
|
|
||||||
result_data = {
|
|
||||||
"documentCount": 1,
|
|
||||||
"content": formatted_content,
|
|
||||||
"outputFormat": target_format,
|
|
||||||
"originalDocument": original_documents[i] if i < len(original_documents) else f"document_{i+1}",
|
|
||||||
"timestamp": get_utc_timestamp()
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
|
|
||||||
|
|
||||||
output_documents.append({
|
|
||||||
"documentName": output_fileName,
|
|
||||||
"documentData": result_data,
|
|
||||||
"mimeType": target_mime_type
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
if not document_contents:
|
||||||
|
return ActionResult.isFailure(
|
||||||
|
error="No valid text content could be extracted from any documents"
|
||||||
|
)
|
||||||
|
|
||||||
|
if merge_documents and len(document_contents) > 1:
|
||||||
|
# Merge all documents into single output
|
||||||
|
logger.info("Merging all documents into single output")
|
||||||
|
return await self._mergeDocuments(document_contents, expected_document_formats, include_metadata)
|
||||||
|
else:
|
||||||
|
# Process each document individually with its own format conversion
|
||||||
|
logger.info("Processing documents individually")
|
||||||
|
output_documents = []
|
||||||
|
|
||||||
|
for item in document_contents:
|
||||||
|
chat_document = item["document"]
|
||||||
|
content = item["content"]
|
||||||
|
i = item["index"]
|
||||||
|
original_name = item["original_name"]
|
||||||
|
|
||||||
|
# Get the expected format for this document (or use default)
|
||||||
|
target_format = None
|
||||||
|
if i < len(expected_document_formats):
|
||||||
|
target_format = expected_document_formats[i]
|
||||||
|
elif len(expected_document_formats) > 0:
|
||||||
|
# If fewer formats than documents, use the last format for remaining documents
|
||||||
|
target_format = expected_document_formats[-1]
|
||||||
|
|
||||||
|
if not target_format:
|
||||||
|
logger.warning(f"No expected format for document {i+1}, skipping")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Use AI to convert format
|
||||||
|
formatted_content = await self._convertContentToFormat(content, target_format)
|
||||||
|
if not formatted_content:
|
||||||
|
logger.warning(f"Failed to format document {i+1}, skipping")
|
||||||
|
continue
|
||||||
|
|
||||||
|
target_extension = target_format.get("extension", ".txt")
|
||||||
|
target_mime_type = target_format.get("mimeType", "text/plain")
|
||||||
|
|
||||||
|
# Create output fileName
|
||||||
|
base_name = original_name.rsplit('.', 1)[0] if '.' in original_name else original_name
|
||||||
|
output_fileName = f"{base_name}_generated_{self._format_timestamp_for_filename()}{target_extension}"
|
||||||
|
|
||||||
|
# Create result data
|
||||||
|
result_data = {
|
||||||
|
"documentCount": 1,
|
||||||
|
"content": formatted_content,
|
||||||
|
"outputFormat": target_format,
|
||||||
|
"originalDocument": original_name,
|
||||||
|
"timestamp": get_utc_timestamp()
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(f"Generated document: {output_fileName} with {len(formatted_content)} characters")
|
||||||
|
|
||||||
|
output_documents.append({
|
||||||
|
"documentName": output_fileName,
|
||||||
|
"documentData": result_data,
|
||||||
|
"mimeType": target_mime_type
|
||||||
|
})
|
||||||
|
|
||||||
if not output_documents:
|
if not output_documents:
|
||||||
return ActionResult.isFailure(
|
return ActionResult.isFailure(
|
||||||
error="No documents could be generated"
|
error="No documents could be generated"
|
||||||
|
|
@ -316,6 +340,83 @@ class MethodDocument(MethodBase):
|
||||||
error=str(e)
|
error=str(e)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def _mergeDocuments(self, document_contents: List[Dict[str, Any]],
                          expected_document_formats: List[Dict[str, Any]],
                          include_metadata: bool) -> ActionResult:
    """
    Merge all documents into a single output document.

    Args:
        document_contents: One dict per extracted document. Only the
            "content" and "original_name" keys are read here.
        expected_document_formats: Requested target formats. Only the first
            entry is used for the merged output; when the list is empty (or
            its first entry is falsy) plain text is used instead.
        include_metadata: Accepted for parity with the caller's parameters.
            NOTE(review): it is not consulted anywhere in this method.

    Returns:
        ActionResult: success carrying exactly one merged document, or a
        failure when no document has non-blank content or when any step
        raises.
    """
    try:
        # Keep only documents that actually carry text, so blank inputs
        # neither produce empty sections nor get reported as merged.
        merged_items = [
            (item["original_name"], item["content"])
            for item in document_contents
            if item["content"].strip()
        ]

        if not merged_items:
            return ActionResult.isFailure(
                error="No content could be extracted from any documents for merging"
            )

        original_file_names = [name for name, _ in merged_items]

        # Each section is introduced by a header naming its source document.
        combined_content = "\n".join(
            f"=== Document: {name} ===\n{content}\n"
            for name, content in merged_items
        )
        logger.info(f"Combined content from {len(original_file_names)} documents: {len(combined_content)} characters")

        # The merged output uses the first requested format only.
        target_format = expected_document_formats[0] if expected_document_formats else None
        if not target_format:
            logger.warning("No expected format specified for merged output, using plain text")
            target_format = {"extension": ".txt", "mimeType": "text/plain"}

        # Use AI to convert format; fall back to the raw text so a failed
        # conversion still yields a usable merged document.
        formatted_content = await self._convertContentToFormat(combined_content, target_format)
        if not formatted_content:
            logger.warning("Failed to format merged content, using raw content")
            formatted_content = combined_content

        target_extension = target_format.get("extension", ".txt")
        target_mime_type = target_format.get("mimeType", "text/plain")

        # Create output fileName for merged document
        timestamp = self._format_timestamp_for_filename()
        output_fileName = f"merged_documents_{timestamp}{target_extension}"

        result_data = {
            # Count only the documents that were merged, so the number
            # always matches the length of "originalDocuments".
            "documentCount": len(original_file_names),
            "content": formatted_content,
            "outputFormat": target_format,
            "originalDocuments": original_file_names,
            "timestamp": get_utc_timestamp(),
            "merged": True
        }

        logger.info(f"Created merged document: {output_fileName} with {len(formatted_content)} characters")

        return ActionResult.isSuccess(
            documents=[{
                "documentName": output_fileName,
                "documentData": result_data,
                "mimeType": target_mime_type
            }]
        )

    except Exception as e:
        # Boundary handler: report any failure as an ActionResult instead of
        # letting it propagate to the caller.
        logger.error(f"Error merging documents: {str(e)}")
        return ActionResult.isFailure(
            error=f"Failed to merge documents: {str(e)}"
        )
|
||||||
|
|
||||||
async def _convertContentToFormat(self, content: str, target_format: Dict[str, Any]) -> str:
|
async def _convertContentToFormat(self, content: str, target_format: Dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Helper function to convert content to the specified format using AI.
|
Helper function to convert content to the specified format using AI.
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue