diff --git a/connectors/BACKUP-connectorDbJson.py b/connectors/BACKUP-connectorDbJson.py
deleted file mode 100644
index f4bdea80..00000000
--- a/connectors/BACKUP-connectorDbJson.py
+++ /dev/null
@@ -1,569 +0,0 @@
-import json
-import os
-from typing import List, Dict, Any, Optional, Union
-import logging
-
-logger = logging.getLogger(__name__)
-
-class DatabaseConnector:
- """
- A connector for JSON-based data storage.
- Provides generic database operations with tenant and user context support.
- """
- def __init__(self, dbHost: str, dbDatabase: str, dbUser: str = None, dbPassword: str = None,
- mandateId: int = None, userId: int = None, skipInitialIdLookup: bool = False):
- """
- Initializes the JSON database connector.
-
- Args:
- dbHost: Directory for the JSON files
- dbDatabase: Database name
- dbUser: Username for authentication (optional)
- dbPassword: API key for authentication (optional)
- mandateId: Context parameter for the tenant
- userId: Context parameter for the user
- skipInitialIdLookup: When True, skips looking up initial IDs for mandateId and userId
- """
- # Store the input parameters
- self.dbHost = dbHost
- self.dbDatabase = dbDatabase
- self.dbUser = dbUser
- self.dbPassword = dbPassword
- self.skipInitialIdLookup = skipInitialIdLookup
-
- # Check if context parameters are set
- if mandateId is None or userId is None:
- raise ValueError("mandateId and userId must be set")
-
- # Ensure the database directory exists
- self.dbFolder = os.path.join(self.dbHost, self.dbDatabase)
- os.makedirs(self.dbFolder, exist_ok=True)
-
- # Cache for loaded data
- self._tablesCache = {}
-
- # Initialize system table
- self._systemTableName = "_system"
- self._initializeSystemTable()
-
- # Temporarily store mandateId and userId
- self._mandateId = mandateId
- self._userId = userId
-
- # If mandateId or userId are 0 and we're not skipping ID lookup, try to use the initial IDs
- if not skipInitialIdLookup:
- if mandateId == 0:
- initialMandateId = self.getInitialId("mandates")
- if initialMandateId is not None:
- self._mandateId = initialMandateId
- logger.info(f"Using initial mandateId: {initialMandateId} instead of 0")
-
- if userId == 0:
- initialUserId = self.getInitialId("users")
- if initialUserId is not None:
- self._userId = initialUserId
- logger.info(f"Using initial userId: {initialUserId} instead of 0")
-
- # Set the effective IDs as properties
- self.mandateId = self._mandateId
- self.userId = self._userId
-
- logger.info(f"DatabaseConnector initialized for directory: {self.dbFolder}")
- logger.debug(f"Context: mandateId={self.mandateId}, userId={self.userId}")
-
- def _initializeSystemTable(self):
- """Initializes the system table if it doesn't exist yet."""
- systemTablePath = self._getTablePath(self._systemTableName)
- if not os.path.exists(systemTablePath):
- emptySystemTable = {}
- self._saveSystemTable(emptySystemTable)
- logger.info(f"System table initialized in {systemTablePath}")
- else:
- # Load existing system table to ensure it's available
- self._loadSystemTable()
- logger.debug(f"Existing system table loaded from {systemTablePath}")
-
- def _loadSystemTable(self) -> Dict[str, int]:
- """Loads the system table with the initial IDs."""
- # Check if system table is in cache
- if f"_{self._systemTableName}" in self._tablesCache:
- return self._tablesCache[f"_{self._systemTableName}"]
-
- systemTablePath = self._getTablePath(self._systemTableName)
- try:
- if os.path.exists(systemTablePath):
- with open(systemTablePath, 'r', encoding='utf-8') as f:
- data = json.load(f)
- # Store in cache with special prefix to avoid collision with regular tables
- self._tablesCache[f"_{self._systemTableName}"] = data
- return data
- else:
- self._tablesCache[f"_{self._systemTableName}"] = {}
- return {}
- except Exception as e:
- logger.error(f"Error loading the system table: {e}")
- self._tablesCache[f"_{self._systemTableName}"] = {}
- return {}
-
- def _saveSystemTable(self, data: Dict[str, int]) -> bool:
- """Saves the system table with the initial IDs."""
- systemTablePath = self._getTablePath(self._systemTableName)
- try:
- with open(systemTablePath, 'w', encoding='utf-8') as f:
- json.dump(data, f, indent=2, ensure_ascii=False)
- # Update cache
- self._tablesCache[f"_{self._systemTableName}"] = data
- return True
- except Exception as e:
- logger.error(f"Error saving the system table: {e}")
- return False
-
- def _getTablePath(self, table: str) -> str:
- """Returns the full path to a table file"""
- return os.path.join(self.dbFolder, f"{table}.json")
-
- def _loadTable(self, table: str) -> List[Dict[str, Any]]:
- """Loads a table from the corresponding JSON file"""
- path = self._getTablePath(table)
-
- # If the table is the system table, load it directly
- if table == self._systemTableName:
- return [] # The system table is not treated like normal tables
-
- # If the table is already in the cache, use the cache
- if table in self._tablesCache:
- return self._tablesCache[table]
-
- # Otherwise load the file
- try:
- if os.path.exists(path):
- with open(path, 'r', encoding='utf-8') as f:
- data = json.load(f)
- self._tablesCache[table] = data
-
- # If data was loaded and no initial ID is registered yet,
- # register the ID of the first record (if available)
- if data and not self.hasInitialId(table):
- if "id" in data[0]:
- self._registerInitialId(table, data[0]["id"])
- logger.info(f"Initial ID {data[0]['id']} for table {table} retroactively registered")
-
- return data
- else:
- # If the file doesn't exist, create an empty table
- logger.info(f"New table {table}")
- self._tablesCache[table] = []
- self._saveTable(table, [])
- return []
- except Exception as e:
- logger.error(f"Error loading table {table}: {e}")
- return []
-
- def _saveTable(self, table: str, data: List[Dict[str, Any]]) -> bool:
- """Saves a table to the corresponding JSON file"""
- # The system table is handled specially
- if table == self._systemTableName:
- return False
-
- path = self._getTablePath(table)
- try:
- with open(path, 'w', encoding='utf-8') as f:
- json.dump(data, f, indent=2, ensure_ascii=False)
-
- # Update the cache
- self._tablesCache[table] = data
- return True
- except Exception as e:
- logger.error(f"Error saving table {table}: {e}")
- return False
-
- def _filterByContext(self, records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
- """
- Filters records by tenant and user context,
- if these fields exist in the record.
- """
- filteredRecords = []
-
- for record in records:
- # Check if mandateId exists in the record and is not null
- hasMandate = "mandateId" in record and record["mandateId"] is not None and record["mandateId"] != ""
-
- # Check if userId exists in the record and is not null
- hasUser = "userId" in record and record["userId"] is not None and record["userId"] != ""
-
- # If both exist, filter accordingly
- if hasMandate and hasUser:
- if record["mandateId"] == self.mandateId:
- filteredRecords.append(record)
- # If only mandateId exists
- elif hasMandate and not hasUser:
- if record["mandateId"] == self.mandateId:
- filteredRecords.append(record)
- # If neither mandateId nor userId exist, add the record
- elif not hasMandate and not hasUser:
- filteredRecords.append(record)
-
- return filteredRecords
-
-
- def _applyRecordFilter(self, records: List[Dict[str, Any]], recordFilter: Dict[str, Any] = None) -> List[Dict[str, Any]]:
- """Applies a record filter to the records"""
- if not recordFilter:
- return records
-
- filteredRecords = []
-
- for record in records:
- match = True
-
- for field, value in recordFilter.items():
- # Check if the field exists
- if field not in record:
- match = False
- break
-
- # Handle type conversion for integer comparisons both ways
- if isinstance(value, int) and isinstance(record[field], str) and record[field].isdigit():
- # Filter value is int, record value is string
- if value != int(record[field]):
- match = False
- break
- elif isinstance(value, str) and value.isdigit() and isinstance(record[field], int):
- # Filter value is string, record value is int
- if record[field] != int(value):
- match = False
- break
- # Otherwise direct comparison
- elif record[field] != value:
- match = False
- break
-
- if match:
- filteredRecords.append(record)
-
- return filteredRecords
-
- def _registerInitialId(self, table: str, initialId: int) -> bool:
- """
- Registers the initial ID for a table.
-
- Args:
- table: Name of the table
- initialId: The initial ID
-
- Returns:
- True on success, False on error
- """
- try:
- # Load the current system table
- systemData = self._loadSystemTable()
-
- # Only register if not already present
- if table not in systemData:
- systemData[table] = initialId
- success = self._saveSystemTable(systemData)
- if success:
- logger.info(f"Initial ID {initialId} for table {table} registered")
- return success
- return True # If already present, this is not an error
- except Exception as e:
- logger.error(f"Error registering the initial ID for table {table}: {e}")
- return False
-
- def _removeInitialId(self, table: str) -> bool:
- """
- Removes the initial ID for a table from the system table.
-
- Args:
- table: Name of the table
-
- Returns:
- True on success, False on error
- """
- try:
- # Load the current system table
- systemData = self._loadSystemTable()
-
- # Remove the entry if it exists
- if table in systemData:
- del systemData[table]
- success = self._saveSystemTable(systemData)
- if success:
- logger.info(f"Initial ID for table {table} removed from system table")
- return success
- return True # If not present, this is not an error
- except Exception as e:
- logger.error(f"Error removing initial ID for table {table}: {e}")
- return False
-
- # Public API
-
- def getTables(self) -> List[str]:
- """
- Returns a list of all available tables.
-
- Returns:
- List of table names
- """
- tables = []
-
- try:
- for filename in os.listdir(self.dbFolder):
- if filename.endswith('.json') and not filename.startswith('_'):
- tableName = filename[:-5] # Remove the .json extension
- tables.append(tableName)
- except Exception as e:
- logger.error(f"Error reading the database directory: {e}")
-
- return tables
-
- def getFields(self, table: str) -> List[str]:
- """
- Returns a list of all fields in a table.
-
- Args:
- table: Name of the table
-
- Returns:
- List of field names
- """
- # Load the table data
- data = self._loadTable(table)
-
- if not data:
- return []
-
- # Take the first record as a reference for the fields
- fields = list(data[0].keys()) if data else []
-
- return fields
-
- def getSchema(self, table: str, language: str = None) -> Dict[str, Dict[str, Any]]:
- """
- Returns a schema object for a table with data types and labels.
-
- Args:
- table: Name of the table
- language: Language for the labels (optional)
-
- Returns:
- Schema object with fields, data types and labels
- """
- # Load the table data
- data = self._loadTable(table)
-
- schema = {}
-
- if not data:
- return schema
-
- # Take the first record as a reference for the fields and data types
- firstRecord = data[0]
-
- for field, value in firstRecord.items():
- # Determine the data type
- dataType = type(value).__name__
-
- # Create label (default is the field name)
- label = field
-
- schema[field] = {
- "type": dataType,
- "label": label
- }
-
- return schema
-
- def getRecordset(self, table: str, fieldFilter: List[str] = None, recordFilter: Dict[str, Any] = None) -> List[Dict[str, Any]]:
- """
- Returns a list of records from a table, filtered by criteria.
-
- Args:
- table: Name of the table
- fieldFilter: Filter for fields (which fields should be returned)
- recordFilter: Filter for records (which records should be returned)
-
- Returns:
- List of filtered records
- """
- # Load the table data
- data = self._loadTable(table)
- logger.debug(f"getRecordset: data volume of {len(data)} bytes")
-
- # Filter by tenant and user context
- filteredData = self._filterByContext(data)
-
- # Apply recordFilter if available
- if recordFilter:
- filteredData = self._applyRecordFilter(filteredData, recordFilter)
-
- # If fieldFilter is available, reduce the fields
- if fieldFilter and isinstance(fieldFilter, list):
- result = []
- for record in filteredData:
- filteredRecord = {}
- for field in fieldFilter:
- if field in record:
- filteredRecord[field] = record[field]
- result.append(filteredRecord)
- return result
-
- return filteredData
-
- def recordCreate(self, table: str, recordData: Dict[str, Any]) -> Dict[str, Any]:
- """
- Creates a new record in the table.
-
- Args:
- table: Name of the table
- recordData: Data for the new record
-
- Returns:
- The created record
- """
- # Load the table data
- data = self._loadTable(table)
-
- # Add mandateId and userId if not present or 0
- if "mandateId" not in recordData or recordData["mandateId"] == 0:
- recordData["mandateId"] = self.mandateId
-
- if "userId" not in recordData or recordData["userId"] == 0:
- recordData["userId"] = self.userId
-
- # Determine the next ID if not present
- if "id" not in recordData:
- nextId = 1
- if data:
- nextId = max(record["id"] for record in data if "id" in record) + 1
- recordData["id"] = nextId
-
- # If the table is empty and a system ID should be registered
- if not data:
- self._registerInitialId(table, recordData["id"])
- logger.info(f"Initial ID {recordData['id']} for table {table} has been registered")
-
- # Add the new record
- data.append(recordData)
-
- # Save the updated table
- if self._saveTable(table, data):
- return recordData
- else:
- raise ValueError(f"Error creating the record in table {table}")
-
- def recordDelete(self, table: str, recordId: Union[str, int]) -> bool:
- """
- Deletes a record from the table.
-
- Args:
- table: Name of the table
- recordId: ID of the record to delete
-
- Returns:
- True on success, False on error
- """
- # Load table data
- data = self._loadTable(table)
-
- # Search for the record
- for i, record in enumerate(data):
- if "id" in record and record["id"] == recordId:
- # Check if the record belongs to the current mandate
- if "mandateId" in record and record["mandateId"] != self.mandateId:
- raise ValueError("Not your mandate")
-
- # Check if it's an initial record
- initialId = self.getInitialId(table)
- if initialId is not None and initialId == recordId:
- # Remove this entry from the system table
- self._removeInitialId(table)
- logger.info(f"Initial ID {recordId} for table {table} has been removed from the system table")
-
- # Delete the record
- del data[i]
-
- # Save the updated table
- return self._saveTable(table, data)
-
- # Record not found
- return False
-
- def recordModify(self, table: str, recordId: Union[str, int], recordData: Dict[str, Any]) -> Dict[str, Any]:
- """
- Modifies a record in the table.
-
- Args:
- table: Name of the table
- recordId: ID of the record to modify
- recordData: New data for the record
-
- Returns:
- The updated record
- """
- # Load table data
- data = self._loadTable(table)
-
- # Search for the record
- for i, record in enumerate(data):
- if "id" in record and record["id"] == recordId:
- # Check if the record belongs to the current mandate
- if "mandateId" in record and record["mandateId"] != self.mandateId:
- raise ValueError("Not your mandate")
-
- # Prevent changing the ID
- if "id" in recordData and recordData["id"] != recordId:
- raise ValueError(f"The ID of a record in table {table} cannot be changed")
-
- # Update the record
- for key, value in recordData.items():
- data[i][key] = value
-
- # Save the updated table
- if self._saveTable(table, data):
- return data[i]
- else:
- raise ValueError(f"Error updating record in table {table}")
-
- # Record not found
- raise ValueError(f"Record with ID {recordId} not found in table {table}")
-
- def hasInitialId(self, table: str) -> bool:
- """
- Checks if an initial ID is registered for a table.
-
- Args:
- table: Name of the table
-
- Returns:
- True if an initial ID is registered, otherwise False
- """
- systemData = self._loadSystemTable()
- return table in systemData
-
- def getInitialId(self, table: str) -> Optional[int]:
- """
- Returns the initial ID for a table.
-
- Args:
- table: Name of the table
-
- Returns:
- The initial ID or None if not present
- """
- systemData = self._loadSystemTable()
- initialId = systemData.get(table)
- logger.debug(f"Database '{self.dbDatabase}': Initial ID for table '{table}' is {initialId}")
- if initialId is None:
- logger.debug(f"No initial ID found for table {table}")
- return initialId
-
- def getAllInitialIds(self) -> Dict[str, int]:
- """
- Returns all registered initial IDs.
-
- Returns:
- Dictionary with table names as keys and initial IDs as values
- """
- systemData = self._loadSystemTable()
- return systemData.copy() # Return a copy to protect the original
\ No newline at end of file
diff --git a/modules/agentAnalyst.py b/modules/agentAnalyst.py
index 5e68c91e..b967af5f 100644
--- a/modules/agentAnalyst.py
+++ b/modules/agentAnalyst.py
@@ -64,9 +64,11 @@ class AgentAnalyst(AgentBase):
}
# Extract data from documents - focusing only on dataExtracted
+ self.mydom.logAdd(task["workflowId"], "Extracting data from documents...", level="info", progress=35)
datasets, documentContext = self._extractData(inputDocuments)
# Generate task analysis to understand what's needed
+ self.mydom.logAdd(task["workflowId"], "Analyzing task requirements...", level="info", progress=45)
analysisPlan = await self._analyzeTask(prompt, documentContext, datasets, outputSpecs)
# Generate all required output documents
@@ -77,7 +79,11 @@ class AgentAnalyst(AgentBase):
outputSpecs = []
# Process each output specification
- for spec in outputSpecs:
+ totalSpecs = len(outputSpecs)
+ for i, spec in enumerate(outputSpecs):
+ progress = 45 + int((i / totalSpecs) * 45) # Progress from 45% to 90%
+ self.mydom.logAdd(task["workflowId"], f"Creating output {i+1}/{totalSpecs}...", level="info", progress=progress)
+
outputLabel = spec.get("label", "")
outputDescription = spec.get("description", "")
@@ -106,9 +112,9 @@ class AgentAnalyst(AgentBase):
documents.append(document)
# Generate feedback
- feedback = f"{analysisPlan.get('analysisApproach')}"
- if analysisPlan.get("keyInsights"):
- feedback += f"\n\n{analysisPlan.get('keyInsights')}"
+ feedback = f"{analysisPlan.get('feedback')}"
+ if analysisPlan.get("insights"):
+ feedback += f"\n\n{analysisPlan.get('insights')}"
return {
"feedback": feedback,
@@ -196,69 +202,74 @@ class AgentAnalyst(AgentBase):
return datasets, documentContext
- async def _analyzeTask(self, prompt: str, context: str, datasets: Dict, outputSpecs: List) -> Dict:
+ async def _analyzeTask(self, prompt: str, documentContext: str, datasets: Dict[str, Any], outputSpecs: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
- Use AI to analyze the task and create a plan for analysis.
+ Analyze the task requirements using AI.
Args:
prompt: The task prompt
- context: Document context text
- datasets: Dictionary of extracted datasets
+ documentContext: Context from input documents
+ datasets: Available datasets
outputSpecs: Output specifications
Returns:
Analysis plan dictionary
"""
- # Prepare dataset information
- datasetInfo = {}
- for name, df in datasets.items():
- try:
- datasetInfo[name] = {
- "shape": df.shape,
- "columns": df.columns.tolist(),
- "dtypes": {col: str(df[col].dtype) for col in df.columns},
- "sample": df.head(3).to_dict(orient='records')
- }
- except:
- datasetInfo[name] = {"error": "Could not process dataset"}
-
+ # Create analysis prompt
analysisPrompt = f"""
- Analyze this data analysis task and create a plan.
+ Analyze this data analysis task and create a detailed plan:
TASK: {prompt}
- AVAILABLE DATA:
- {json.dumps(datasetInfo, indent=2)}
-
DOCUMENT CONTEXT:
- {context[:1000]}... (truncated)
+ {documentContext}
- OUTPUT REQUIREMENTS:
+ AVAILABLE DATASETS:
+ {json.dumps(datasets, indent=2)}
+
+ REQUIRED OUTPUTS:
{json.dumps(outputSpecs, indent=2)}
- Create a detailed analysis plan in JSON format with the following structure:
+ Create a detailed analysis plan in JSON format with:
{{
- "analysisType": "statistical|trend|comparative|predictive|cluster|general",
- "keyQuestions": ["question1", "question2"],
- "recommendedVisualizations": [{{
- "type": "chart_type",
- "dataSource": "dataset_name",
- "variables": ["col1", "col2"],
- "purpose": "explanation"
- }}],
- "keyInsights": "brief summary of initial insights",
- "analysisApproach": "brief description of recommended approach"
+ "analysisSteps": [
+ {{
+ "step": "step description",
+ "purpose": "why this step is needed",
+ "datasets": ["dataset1", "dataset2"],
+ "techniques": ["technique1", "technique2"],
+ "outputs": ["output1", "output2"]
+ }}
+ ],
+ "visualizations": [
+ {{
+ "type": "visualization type",
+ "purpose": "what it shows",
+ "datasets": ["dataset1"],
+ "settings": {{"key": "value"}}
+ }}
+ ],
+ "insights": [
+ {{
+ "type": "insight type",
+ "description": "what to look for",
+ "datasets": ["dataset1"]
+ }}
+ ],
+ "feedback": "explanation of the analysis approach"
}}
- Only return valid JSON. No preamble or explanations.
+ Respond with ONLY the JSON object, no additional text or explanations.
"""
+
try:
+ # Get analysis plan from AI
response = await self.mydom.callAi([
- {"role": "system", "content": "You are a data analysis expert. Respond with valid JSON only."},
+ {"role": "system", "content": "You are a data analysis expert. Create detailed analysis plans. Respond with valid JSON only."},
{"role": "user", "content": analysisPrompt}
- ], produceUserAnswer = True)
+ ], produceUserAnswer=True)
- # Extract JSON from response
+ # Extract JSON
jsonStart = response.find('{')
jsonEnd = response.rfind('}') + 1
@@ -266,154 +277,249 @@ class AgentAnalyst(AgentBase):
plan = json.loads(response[jsonStart:jsonEnd])
return plan
else:
- # Fallback if JSON not found
+ # Fallback plan
+ logger.warning(f"Not able creating analysis plan, generating fallback plan")
return {
- "analysisType": "general",
- "keyQuestions": ["What insights can be extracted from this data?"],
- "recommendedVisualizations": [],
- "keyInsights": "Analysis plan could not be created",
- "analysisApproach": "General exploratory analysis"
+ "analysisSteps": [
+ {
+ "step": "Basic data analysis",
+ "purpose": "Understand the data structure and content",
+ "datasets": list(datasets.keys()),
+ "techniques": ["summary statistics", "data visualization"],
+ "outputs": ["summary report", "basic visualizations"]
+ }
+ ],
+ "visualizations": [
+ {
+ "type": "basic charts",
+ "purpose": "Show data distribution and relationships",
+ "datasets": list(datasets.keys()),
+ "settings": {}
+ }
+ ],
+ "insights": [
+ {
+ "type": "basic insights",
+ "description": "Key findings from the data",
+ "datasets": list(datasets.keys())
+ }
+ ],
+ "feedback": f"I'll analyze the data and provide insights about {prompt}"
}
except Exception as e:
logger.warning(f"Error creating analysis plan: {str(e)}")
+ # Simple fallback plan
return {
- "analysisType": "general",
- "keyQuestions": ["What insights can be extracted from this data?"],
- "recommendedVisualizations": [],
- "keyInsights": "Analysis plan could not be created",
- "analysisApproach": "General exploratory analysis"
+ "analysisSteps": [
+ {
+ "step": "Basic data analysis",
+ "purpose": "Understand the data structure and content",
+ "datasets": list(datasets.keys()),
+ "techniques": ["summary statistics", "data visualization"],
+ "outputs": ["summary report", "basic visualizations"]
+ }
+ ],
+ "visualizations": [
+ {
+ "type": "basic charts",
+ "purpose": "Show data distribution and relationships",
+ "datasets": list(datasets.keys()),
+ "settings": {}
+ }
+ ],
+ "insights": [
+ {
+ "type": "basic insights",
+ "description": "Key findings from the data",
+ "datasets": list(datasets.keys())
+ }
+ ],
+ "feedback": f"I'll analyze the data and provide insights about {prompt}"
}
async def _createVisualization(self, datasets: Dict, prompt: str, outputLabel: str,
analysisPlan: Dict, description: str) -> Dict:
"""
- Create visualization document using AI guidance.
+ Create a visualization based on the analysis plan.
Args:
datasets: Dictionary of datasets
prompt: Original task prompt
- outputLabel: Output filename
- analysisPlan: Analysis plan from AI
+ outputLabel: Output file label
+ analysisPlan: Analysis plan
description: Output description
Returns:
- Visualization document
+ Document dictionary with visualization
"""
- # Determine format from filename
- formatType = outputLabel.split('.')[-1].lower()
- if formatType not in ['png', 'jpg', 'jpeg', 'svg']:
- formatType = 'png'
-
- # If no datasets available, create error message image
- if not datasets:
- plt.figure(figsize=(10, 6))
- plt.text(0.5, 0.5, "No data available for visualization",
- ha='center', va='center', fontsize=14)
- plt.tight_layout()
- imgData = self._getImageBase64(formatType)
- plt.close()
-
- return {
- "label": outputLabel,
- "content": imgData,
- "metadata": {
- "contentType": f"image/{formatType}"
- }
- }
-
- # Get recommended visualization from plan
- recommendedViz = analysisPlan.get("recommendedVisualizations", [])
-
- # Prepare dataset info for the first dataset if none specified
- if not recommendedViz and datasets:
- name, df = next(iter(datasets.items()))
- recommendedViz = [{
- "type": "auto",
- "dataSource": name,
- "variables": df.columns.tolist()[:5],
- "purpose": "general analysis"
- }]
-
- # Create visualization code prompt
- vizPrompt = f"""
- Generate Python matplotlib/seaborn code to create a visualization for:
-
- TASK: {prompt}
-
- VISUALIZATION REQUIREMENTS:
- - Output format: {formatType}
- - Filename: {outputLabel}
- - Description: {description}
-
- RECOMMENDED VISUALIZATION:
- {json.dumps(recommendedViz, indent=2)}
-
- AVAILABLE DATASETS:
- """
-
- # Add dataset info for recommended sources
- for viz in recommendedViz:
- dataSource = viz.get("dataSource")
- if dataSource in datasets:
- df = datasets[dataSource]
- vizPrompt += f"\nDataset '{dataSource}':\n"
- vizPrompt += f"- Shape: {df.shape}\n"
- vizPrompt += f"- Columns: {df.columns.tolist()}\n"
- vizPrompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n"
-
- vizPrompt += """
- Generate ONLY Python code that:
- 1. Uses matplotlib and/or seaborn to create a clear visualization
- 2. Sets figure size to (10, 6)
- 3. Includes appropriate titles, labels, and legend
- 4. Uses professional color schemes
- 5. Handles any missing data gracefully
-
- Return ONLY executable Python code, no explanations or markdown.
- """
-
try:
- # Get visualization code from AI
- vizCode = await self.mydom.callAi([
- {"role": "system", "content": "You are a data visualization expert. Provide only executable Python code."},
- {"role": "user", "content": vizPrompt}
- ], produceUserAnswer = True)
+ # Get visualization recommendations
+ vizRecommendations = analysisPlan.get("visualizations", [])
- # Clean code
- vizCode = vizCode.replace("```python", "").replace("```", "").strip()
-
- # Execute visualization code
- plt.figure(figsize=(10, 6))
-
- # Make local variables available to the code
- localVars = {
- "plt": plt,
- "sns": sns,
- "pd": pd,
- "np": __import__('numpy')
- }
-
- # Add datasets to local variables
- for name, df in datasets.items():
- # Create a sanitized variable name
- varName = ''.join(c if c.isalnum() else '_' for c in name)
- localVars[varName] = df
+ if not vizRecommendations:
+ # Generate visualization recommendations if none provided
+ self.mydom.logAdd(analysisPlan.get("workflowId"), "Generating visualization recommendations...", level="info", progress=50)
+ vizPrompt = f"""
+ Based on this data and task, recommend appropriate visualizations.
- # Also add with standard names for simpler code
- if "df" not in localVars:
- localVars["df"] = df
- elif "df2" not in localVars:
- localVars["df2"] = df
+ TASK: {prompt}
+ DESCRIPTION: {description}
+
+ DATASETS:
+ {json.dumps({name: {"shape": df.shape, "columns": df.columns.tolist()}
+ for name, df in datasets.items()}, indent=2)}
+
+ Recommend visualizations in JSON format:
+ {{
+ "visualizations": [
+ {{
+ "type": "chart_type",
+ "dataSource": "dataset_name",
+ "variables": ["col1", "col2"],
+ "purpose": "explanation"
+ }}
+ ]
+ }}
+ """
+
+ response = await self.mydom.callAi([
+ {"role": "system", "content": "You are a data visualization expert. Recommend appropriate visualizations based on the data and task."},
+ {"role": "user", "content": vizPrompt}
+ ])
+
+ # Extract JSON
+ jsonStart = response.find('{')
+ jsonEnd = response.rfind('}') + 1
+
+ if jsonStart >= 0 and jsonEnd > jsonStart:
+ vizData = json.loads(response[jsonStart:jsonEnd])
+ vizRecommendations = vizData.get("visualizations", [])
- # Execute the visualization code
- exec(vizCode, globals(), localVars)
+ # Determine format from filename
+ formatType = outputLabel.split('.')[-1].lower()
+ if formatType not in ['png', 'jpg', 'jpeg', 'svg']:
+ formatType = 'png'
- # Capture the image
- imgData = self._getImageBase64(formatType)
- plt.close()
+ # If no datasets available, create error message image
+ if not datasets:
+ plt.figure(figsize=(10, 6))
+ plt.text(0.5, 0.5, "No data available for visualization",
+ ha='center', va='center', fontsize=14)
+ plt.tight_layout()
+ imgData = self._getImageBase64(formatType)
+ plt.close()
+
+ return {
+ "label": outputLabel,
+ "content": imgData,
+ "metadata": {
+ "contentType": f"image/{formatType}"
+ }
+ }
- return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}")
+ # Prepare dataset info for the first dataset if none specified
+ if not vizRecommendations and datasets:
+ name, df = next(iter(datasets.items()))
+ vizRecommendations = [{
+ "type": "auto",
+ "dataSource": name,
+ "variables": df.columns.tolist()[:5],
+ "purpose": "general analysis"
+ }]
+
+ # Create visualization code prompt
+ vizPrompt = f"""
+ Generate Python matplotlib/seaborn code to create a visualization for:
+
+ TASK: {prompt}
+
+ VISUALIZATION REQUIREMENTS:
+ - Output format: {formatType}
+ - Filename: {outputLabel}
+ - Description: {description}
+
+ RECOMMENDED VISUALIZATION:
+ {json.dumps(vizRecommendations, indent=2)}
+
+ AVAILABLE DATASETS:
+ """
+
+ # Add dataset info for recommended sources
+ for viz in vizRecommendations:
+ dataSource = viz.get("dataSource")
+ if dataSource in datasets:
+ df = datasets[dataSource]
+ vizPrompt += f"\nDataset '{dataSource}':\n"
+ vizPrompt += f"- Shape: {df.shape}\n"
+ vizPrompt += f"- Columns: {df.columns.tolist()}\n"
+ vizPrompt += f"- Sample data: {df.head(3).to_dict(orient='records')}\n"
+
+ vizPrompt += """
+ Generate ONLY Python code that:
+ 1. Uses matplotlib and/or seaborn to create a clear visualization
+ 2. Sets figure size to (10, 6)
+ 3. Includes appropriate titles, labels, and legend
+ 4. Uses professional color schemes
+ 5. Handles any missing data gracefully
+
+ Return ONLY executable Python code, no explanations or markdown.
+ """
+
+ try:
+ # Get visualization code from AI
+ vizCode = await self.mydom.callAi([
+ {"role": "system", "content": "You are a data visualization expert. Provide only executable Python code."},
+ {"role": "user", "content": vizPrompt}
+ ], produceUserAnswer = True)
+
+ # Clean code
+ vizCode = vizCode.replace("```python", "").replace("```", "").strip()
+
+ # Execute visualization code
+ plt.figure(figsize=(10, 6))
+
+ # Make local variables available to the code
+ localVars = {
+ "plt": plt,
+ "sns": sns,
+ "pd": pd,
+ "np": __import__('numpy')
+ }
+
+ # Add datasets to local variables
+ for name, df in datasets.items():
+ # Create a sanitized variable name
+ varName = ''.join(c if c.isalnum() else '_' for c in name)
+ localVars[varName] = df
+
+ # Also add with standard names for simpler code
+ if "df" not in localVars:
+ localVars["df"] = df
+ elif "df2" not in localVars:
+ localVars["df2"] = df
+
+ # Execute the visualization code
+ exec(vizCode, globals(), localVars)
+
+ # Capture the image
+ imgData = self._getImageBase64(formatType)
+ plt.close()
+
+ return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}")
+
+ except Exception as e:
+ logger.error(f"Error creating visualization: {str(e)}", exc_info=True)
+
+ # Create error message image
+ plt.figure(figsize=(10, 6))
+ plt.text(0.5, 0.5, f"Visualization error: {str(e)}",
+ ha='center', va='center', fontsize=12)
+ plt.tight_layout()
+ imgData = self._getImageBase64(formatType)
+ plt.close()
+
+ return self.formatAgentDocumentOutput(outputLabel, imgData, f"image/{formatType}")
except Exception as e:
logger.error(f"Error creating visualization: {str(e)}", exc_info=True)
diff --git a/modules/agentEmail.py b/modules/agentEmail.py
index 0a6482ab..6686b725 100644
--- a/modules/agentEmail.py
+++ b/modules/agentEmail.py
@@ -81,6 +81,7 @@ class AgentEmail(AgentBase):
# Extract task information
prompt = task.get("prompt", "")
inputDocuments = task.get("inputDocuments", [])
+ outputSpecs = task.get("outputSpecifications", [])
# Check AI service
if not self.mydom:
@@ -128,22 +129,36 @@ class AgentEmail(AgentBase):
# Prepare output documents
documents = []
- # Add HTML preview document
- previewDoc = self.formatAgentDocumentOutput(
- "email_preview.html",
- htmlPreview,
- "text/html"
- )
- documents.append(previewDoc)
-
- # Add email template as JSON for reference
- templateJson = json.dumps(emailTemplate, indent=2)
- templateDoc = self.formatAgentDocumentOutput(
- "email_template.json",
- templateJson,
- "application/json"
- )
- documents.append(templateDoc)
+ # Process output specifications
+ for spec in outputSpecs:
+ label = spec.get("label", "")
+ description = spec.get("description", "")
+
+ if label.endswith(".html"):
+ # Create the HTML template file
+ templateDoc = self.formatAgentDocumentOutput(
+ label,
+ emailTemplate["htmlBody"], # Use the actual HTML body, not the preview
+ "text/html"
+ )
+ documents.append(templateDoc)
+ elif label.endswith(".json"):
+ # Create JSON template if requested
+ templateJson = json.dumps(emailTemplate, indent=2)
+ templateDoc = self.formatAgentDocumentOutput(
+ label,
+ templateJson,
+ "application/json"
+ )
+ documents.append(templateDoc)
+ else:
+ # Default to preview for other cases
+ previewDoc = self.formatAgentDocumentOutput(
+ label,
+ htmlPreview,
+ "text/html"
+ )
+ documents.append(previewDoc)
# Prepare feedback message
if draft_result:
@@ -230,18 +245,19 @@ class AgentEmail(AgentBase):
# Add document name to contents
documentContents.append(f"\n\n--- {docName} ---\n")
- # Check if document has data to attach
+ # Process document data directly
if doc.get("data"):
- # Add to attachments
+ # Add to attachments with proper metadata
attachments.append({
"name": docName,
- "document": doc
+ "document": {
+ "data": doc["data"],
+ "mimeType": doc.get("mimeType", "application/octet-stream"),
+ "base64Encoded": doc.get("base64Encoded", False)
+ }
})
-
- # Add document name to contents
documentContents.append(f"Document attached: {docName}")
else:
- # If no data, just add the name
documentContents.append(f"Document referenced: {docName}")
return "\n".join(documentContents), attachments
@@ -282,7 +298,7 @@ class AgentEmail(AgentBase):
try:
response = await self.mydom.callAi([
- {"role": "system", "content": "You are an email template specialist. Respond with valid JSON only."},
+ {"role": "system", "content": "You are an email template specialist. Create professional emails. Respond with valid JSON only."},
{"role": "user", "content": emailPrompt}
], produceUserAnswer=True)
@@ -294,7 +310,8 @@ class AgentEmail(AgentBase):
template = json.loads(response[jsonStart:jsonEnd])
return template
else:
- # Fallback if JSON not found
+ # Fallback plan
+ logger.warning(f"Not able creating email template, generating fallback plan")
return {
"recipient": "recipient@example.com",
"subject": "Information Regarding Your Request",
@@ -471,8 +488,8 @@ class AgentEmail(AgentBase):
def _createGraphDraftEmail(self, access_token, recipient, subject, body, attachments=None):
"""
- Create a draft email using Microsoft Graph API with fixed attachment handling.
- Uses the document data directly for attachments.
+ Create a draft email using Microsoft Graph API.
+ Treats all files as binary attachments without content analysis.
Args:
access_token: Microsoft Graph access token
@@ -521,87 +538,69 @@ class AgentEmail(AgentBase):
logger.warning(f"No data found for attachment: {file_name}")
continue
- # Get content type and base64 flag
- content_type = doc.get('contentType', 'application/octet-stream')
+ # Get content type from document metadata
+ mime_type = doc.get('mimeType', 'application/octet-stream')
is_base64 = doc.get('base64Encoded', False)
- # Handle base64 encoding if needed
- if not is_base64:
- logger.info(f"Base64 encoding content for {file_name}")
- if isinstance(file_content, str):
- try:
- # Check if already valid base64
- base64.b64decode(file_content)
- logger.info("Content appears to be valid base64 already")
- except:
- # Not valid base64, encode it
- logger.info("Encoding string content to base64")
- file_content = base64.b64encode(file_content.encode('utf-8')).decode('utf-8')
- elif isinstance(file_content, bytes):
- logger.info("Encoding bytes content to base64")
- file_content = base64.b64encode(file_content).decode('utf-8')
-
- # Calculate actual size from base64 content
+ # Handle content encoding
try:
- decoded_size = len(base64.b64decode(file_content))
+ if is_base64:
+ # Content is already base64 encoded
+ content_bytes = file_content
+ else:
+ # Content needs to be base64 encoded
+ if isinstance(file_content, str):
+ # For text files, encode the string to bytes first
+ content_bytes = base64.b64encode(file_content.encode('utf-8')).decode('utf-8')
+ elif isinstance(file_content, bytes):
+ # For binary files, encode directly
+ content_bytes = base64.b64encode(file_content).decode('utf-8')
+ else:
+ logger.warning(f"Unexpected content type for {file_name}")
+ continue
+
+ # Calculate size from decoded content
+ decoded_size = len(base64.b64decode(content_bytes))
+
+ # Add attachment to email data
+ logger.info(f"Adding attachment: {file_name} ({mime_type}, size: {decoded_size} bytes)")
+ attachment_data = {
+ '@odata.type': '#microsoft.graph.fileAttachment',
+ 'name': file_name,
+ 'contentType': mime_type,
+ 'contentBytes': content_bytes,
+ 'isInline': False,
+ 'size': decoded_size
+ }
+ email_data['attachments'].append(attachment_data)
+ logger.info(f"Successfully added attachment: {file_name}")
+
except Exception as e:
- logger.error(f"Error calculating size for {file_name}: {str(e)}")
- decoded_size = 0
-
- # Add attachment to email data
- logger.info(f"Adding attachment: {file_name} ({content_type}, size: {decoded_size} bytes)")
- attachment_data = {
- '@odata.type': '#microsoft.graph.fileAttachment',
- 'name': file_name,
- 'contentType': content_type,
- 'contentBytes': file_content,
- 'isInline': False,
- 'size': decoded_size
- }
- email_data['attachments'].append(attachment_data)
- logger.info(f"Successfully added attachment: {file_name}")
+ logger.error(f"Error processing attachment {file_name}: {str(e)}")
+ continue
# Try to create draft using drafts folder endpoint
try:
- logger.info("Attempting to create draft email using drafts folder endpoint")
+ logger.info("Attempting to create draft email using messages endpoint")
logger.info(f"Email data structure: subject={subject}, recipient={recipient}, " +
f"has_attachments={bool(email_data.get('attachments'))}, " +
f"attachment_count={len(email_data.get('attachments', []))}")
- # Log the full email data structure for debugging
- logger.debug(f"Full email data structure: {json.dumps(email_data, indent=2)}")
-
- # First create the draft message
+ # Create the draft message
response = requests.post(
- 'https://graph.microsoft.com/v1.0/me/mailFolders/drafts/messages',
+ 'https://graph.microsoft.com/v1.0/me/messages',
headers=headers,
json=email_data
)
if response.status_code >= 200 and response.status_code < 300:
- logger.info("Successfully created draft email using drafts folder endpoint")
+ logger.info("Successfully created draft email using messages endpoint")
return response.json()
else:
- logger.error(f"Drafts folder method failed: {response.status_code} - {response.text}")
+ logger.error(f"Messages endpoint method failed: {response.status_code} - {response.text}")
logger.error(f"Request headers: {headers}")
logger.error(f"Request body: {json.dumps(email_data, indent=2)}")
-
- # Try fallback method with messages endpoint
- logger.info("Trying fallback with messages endpoint")
- response = requests.post(
- 'https://graph.microsoft.com/v1.0/me/messages',
- headers=headers,
- json=email_data
- )
-
- if response.status_code >= 200 and response.status_code < 300:
- logger.info("Successfully created draft email using messages endpoint")
- return response.json()
- else:
- logger.error(f"Messages endpoint method also failed: {response.status_code} - {response.text}")
- logger.error(f"Request headers: {headers}")
- logger.error(f"Request body: {json.dumps(email_data, indent=2)}")
- return None
+ return None
except Exception as e:
logger.error(f"Exception creating draft email: {str(e)}", exc_info=True)
diff --git a/modules/agentWebcrawler.py b/modules/agentWebcrawler.py
index 7f5cad09..7dd87825 100644
--- a/modules/agentWebcrawler.py
+++ b/modules/agentWebcrawler.py
@@ -77,6 +77,7 @@ class AgentWebcrawler(AgentBase):
}
# Create research plan
+ self.mydom.logAdd(task["workflowId"], "Creating research plan...", level="info", progress=35)
researchPlan = await self._createResearchPlan(prompt)
# Check if this is truly a web research task
@@ -87,9 +88,11 @@ class AgentWebcrawler(AgentBase):
}
# Gather raw material through web research
+ self.mydom.logAdd(task["workflowId"], "Gathering research material...", level="info", progress=45)
rawResults = await self._gatherResearchMaterial(researchPlan)
# Format results into requested output documents
+ self.mydom.logAdd(task["workflowId"], "Creating output documents...", level="info", progress=55)
documents = await self._createOutputDocuments(
prompt,
rawResults,
@@ -142,9 +145,9 @@ class AgentWebcrawler(AgentBase):
try:
# Get research plan from AI
response = await self.mydom.callAi([
- {"role": "system", "content": "You are a web research planning expert. Create precise research plans in JSON format only."},
+ {"role": "system", "content": "You are a web research planning expert. Create precise research plans. Respond with valid JSON only."},
{"role": "user", "content": researchPrompt}
- ])
+ ], produceUserAnswer=True)
# Extract JSON
jsonStart = response.find('{')
@@ -202,7 +205,9 @@ class AgentWebcrawler(AgentBase):
# Process direct URLs
directUrls = researchPlan.get("directUrls", [])[:self.maxUrl]
- for url in directUrls:
+ for i, url in enumerate(directUrls):
+ progress = 45 + int((i / len(directUrls)) * 5) # Progress from 45% to 50%
+ self.mydom.logAdd(researchPlan.get("workflowId"), f"Processing direct URL {i+1}/{len(directUrls)}...", level="info", progress=progress)
logger.info(f"Processing direct URL: {url}")
try:
# Fetch and extract content
@@ -226,7 +231,9 @@ class AgentWebcrawler(AgentBase):
# Process search terms
searchTerms = researchPlan.get("searchTerms", [])[:self.maxSearchTerms]
- for term in searchTerms:
+ for i, term in enumerate(searchTerms):
+ progress = 50 + int((i / len(searchTerms)) * 5) # Progress from 50% to 55%
+ self.mydom.logAdd(researchPlan.get("workflowId"), f"Searching term {i+1}/{len(searchTerms)}...", level="info", progress=progress)
logger.info(f"Searching for: {term}")
try:
# Perform search
@@ -302,19 +309,15 @@ class AgentWebcrawler(AgentBase):
Only include information actually found in the content. No fabrications or assumptions.
"""
- if self.mydom:
- summary = await self.mydom.callAi([
- {"role": "system", "content": "You summarize web content accurately and concisely, focusing only on what is actually in the content."},
- {"role": "user", "content": summaryPrompt}
- ])
-
- # Store the summary
- result["summary"] = summary
- else:
- # Fallback if no AI service
- logger.warning(f"Not able to summarize result, using fallback plan.")
- result["summary"] = f"Content from {result['url']} ({len(content)} characters)"
-
+ # Get summary from AI
+ summary = await self.mydom.callAi([
+ {"role": "system", "content": "You are a web content summarization expert. Create concise summaries."},
+ {"role": "user", "content": summaryPrompt}
+ ], produceUserAnswer=True)
+
+ # Add summary to result
+ result["summary"] = summary.strip()
+
except Exception as e:
logger.warning(f"Error summarizing result {i+1}: {str(e)}")
result["summary"] = f"Error creating summary: {str(e)}"
diff --git a/modules/documentProcessor.py b/modules/documentProcessor.py
index d3b637e1..3e082578 100644
--- a/modules/documentProcessor.py
+++ b/modules/documentProcessor.py
@@ -38,8 +38,50 @@ def getDocumentContents(fileMetadata: Dict[str, Any], fileContent: bytes) -> Lis
# Extract content based on MIME type
contents = []
+ # Try to detect actual file type from content for unknown MIME types
+ if mimeType == "application/octet-stream":
+ # Check file extension first
+ ext = os.path.splitext(fileName)[1].lower()
+ if ext:
+ # Map common extensions to MIME types
+ ext_to_mime = {
+ '.txt': 'text/plain',
+ '.md': 'text/markdown',
+ '.csv': 'text/csv',
+ '.json': 'application/json',
+ '.xml': 'application/xml',
+ '.js': 'application/javascript',
+ '.py': 'application/x-python',
+ '.svg': 'image/svg+xml',
+ '.jpg': 'image/jpeg',
+ '.jpeg': 'image/jpeg',
+ '.png': 'image/png',
+ '.gif': 'image/gif',
+ '.pdf': 'application/pdf',
+ '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ '.doc': 'application/msword',
+ '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ '.xls': 'application/vnd.ms-excel',
+ '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+ '.ppt': 'application/vnd.ms-powerpoint'
+ }
+ if ext in ext_to_mime:
+ mimeType = ext_to_mime[ext]
+ logger.info(f"Detected MIME type {mimeType} from extension {ext}")
+ else:
+ logger.warning(f"Unknown file extension {ext} for file {fileName}")
+
+ # Try to detect if it's text content
+ try:
+ fileContent.decode('utf-8')  # raises UnicodeDecodeError if content is not valid UTF-8 text
+ logger.info(f"Successfully decoded file {fileName} as text")
+ contents.extend(extractTextContent(fileName, fileContent, "text/plain"))
+ except UnicodeDecodeError:
+ logger.info(f"File {fileName} is not text, treating as binary")
+ contents.extend(extractBinaryContent(fileName, fileContent, mimeType))
+
# Text-based formats (excluding CSV which has its own handler)
- if mimeType == "text/csv":
+ elif mimeType == "text/csv":
contents.extend(extractCsvContent(fileName, fileContent))
# Then handle other text-based formats
@@ -86,6 +128,7 @@ def getDocumentContents(fileMetadata: Dict[str, Any], fileContent: bytes) -> Lis
# Binary data as fallback for unknown formats
else:
+ logger.warning(f"Unknown MIME type {mimeType} for file {fileName}, treating as binary")
contents.extend(extractBinaryContent(fileName, fileContent, mimeType))
# Fallback when no content could be extracted
@@ -99,7 +142,7 @@ def getDocumentContents(fileMetadata: Dict[str, Any], fileContent: bytes) -> Lis
"sequenceNr": 1,
"name": '1_undefined',
"ext": os.path.splitext(fileName)[1][1:] if os.path.splitext(fileName)[1] else "bin",
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": encoded_data,
"base64Encoded": True,
"metadata": {
@@ -130,13 +173,13 @@ def getDocumentContents(fileMetadata: Dict[str, Any], fileContent: bytes) -> Lis
return contents
except Exception as e:
- logger.error(f"Error during content extraction: {str(e)}")
+ logger.error(f"Error during content extraction for file {fileMetadata.get('name', 'unknown')}: {str(e)}", exc_info=True)
# Fallback on error - return original data
return [{
"sequenceNr": 1,
"name": fileMetadata.get("name", "unknown"),
"ext": os.path.splitext(fileMetadata.get("name", ""))[1][1:] if os.path.splitext(fileMetadata.get("name", ""))[1] else "bin",
- "contentType": fileMetadata.get("mimeType", "application/octet-stream"),
+ "mimeType": fileMetadata.get("mimeType", "application/octet-stream"),
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -206,7 +249,7 @@ def extractTextContent(fileName: str, fileContent: bytes, mimeType: str) -> List
"sequenceNr": 1,
"name": "1_text", # Simplified naming
"ext": fileExtension,
- "contentType": "text/plain",
+ "mimeType": "text/plain",
"data": textContent,
"base64Encoded": False,
"metadata": {
@@ -225,7 +268,7 @@ def extractTextContent(fileName: str, fileContent: bytes, mimeType: str) -> List
"sequenceNr": 1,
"name": "1_text", # Simplified naming
"ext": fileExtension,
- "contentType": "text/plain",
+ "mimeType": "text/plain",
"data": textContent,
"base64Encoded": False,
"metadata": {
@@ -242,7 +285,7 @@ def extractTextContent(fileName: str, fileContent: bytes, mimeType: str) -> List
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -256,7 +299,7 @@ def extractTextContent(fileName: str, fileContent: bytes, mimeType: str) -> List
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -282,7 +325,7 @@ def extractCsvContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_csv", # Simplified naming
"ext": "csv",
- "contentType": "text/csv",
+ "mimeType": "text/csv",
"data": csvContent,
"base64Encoded": False,
"metadata": {
@@ -302,7 +345,7 @@ def extractCsvContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_csv", # Simplified naming
"ext": "csv",
- "contentType": "text/csv",
+ "mimeType": "text/csv",
"data": csvContent,
"base64Encoded": False,
"metadata": {
@@ -319,7 +362,7 @@ def extractCsvContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "csv",
- "contentType": "text/csv",
+ "mimeType": "text/csv",
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -332,7 +375,7 @@ def extractCsvContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "csv",
- "contentType": "text/csv",
+ "mimeType": "text/csv",
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -364,7 +407,7 @@ def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_svg", # Simplified naming
"ext": "svg",
- "contentType": "image/svg+xml",
+ "mimeType": "image/svg+xml",
"data": svgText,
"base64Encoded": False,
"metadata": {
@@ -380,7 +423,7 @@ def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_text",
"ext": "svg",
- "contentType": "text/plain",
+ "mimeType": "text/plain",
"data": svgText,
"base64Encoded": False,
"metadata": {
@@ -401,7 +444,7 @@ def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_svg", # Simplified naming
"ext": "svg",
- "contentType": "image/svg+xml",
+ "mimeType": "image/svg+xml",
"data": svgText,
"base64Encoded": False,
"metadata": {
@@ -422,7 +465,7 @@ def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "svg",
- "contentType": "image/svg+xml",
+ "mimeType": "image/svg+xml",
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -438,7 +481,7 @@ def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": "svg",
- "contentType": "image/svg+xml",
+ "mimeType": "image/svg+xml",
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -519,7 +562,7 @@ def extractImageContent(fileName: str, fileContent: bytes, mimeType: str) -> Lis
"sequenceNr": 1,
"name": "1_image", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": encoded_data,
"base64Encoded": True,
"metadata": imageMetadata
@@ -531,7 +574,7 @@ def extractImageContent(fileName: str, fileContent: bytes, mimeType: str) -> Lis
"sequenceNr": 2,
"name": "2_text_image_info", # Simplified naming with label
"ext": "txt",
- "contentType": "text/plain",
+ "mimeType": "text/plain",
"data": imageDescription,
"base64Encoded": False,
"metadata": {
@@ -566,7 +609,7 @@ def extractPdfContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_pdf", # Simplified naming
"ext": "pdf",
- "contentType": "application/pdf",
+ "mimeType": "application/pdf",
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -604,7 +647,7 @@ def extractPdfContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": len(contents) + 1,
"name": f"{len(contents) + 1}_text", # Simplified naming
"ext": "txt",
- "contentType": "text/plain",
+ "mimeType": "text/plain",
"data": extractedText,
"base64Encoded": False,
"metadata": {
@@ -639,7 +682,7 @@ def extractPdfContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": len(contents) + 1,
"name": f"{len(contents) + 1}_image_page{pageNum+1}_{imgIndex+1}", # Simplified naming with label
"ext": imageExt,
- "contentType": f"image/{imageExt}",
+ "mimeType": f"image/{imageExt}",
"data": base64.b64encode(imageBytes).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -667,7 +710,7 @@ def extractPdfContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]
"sequenceNr": 1,
"name": "1_pdf", # Simplified naming
"ext": "pdf",
- "contentType": "application/pdf",
+ "mimeType": "application/pdf",
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -706,7 +749,7 @@ def extractWordContent(fileName: str, fileContent: bytes, mimeType: str) -> List
"sequenceNr": 1,
"name": "1_word", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -743,7 +786,7 @@ def extractWordContent(fileName: str, fileContent: bytes, mimeType: str) -> List
"sequenceNr": 1,
"name": "1_text", # Simplified naming
"ext": "txt",
- "contentType": "text/plain",
+ "mimeType": "text/plain",
"data": extractedText,
"base64Encoded": False,
"metadata": {
@@ -765,7 +808,7 @@ def extractWordContent(fileName: str, fileContent: bytes, mimeType: str) -> List
"sequenceNr": 1,
"name": "1_word", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -804,7 +847,7 @@ def extractExcelContent(fileName: str, fileContent: bytes, mimeType: str) -> Lis
"sequenceNr": 1,
"name": "1_excel", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -845,7 +888,7 @@ def extractExcelContent(fileName: str, fileContent: bytes, mimeType: str) -> Lis
"sequenceNr": len(contents) + 1,
"name": f"{len(contents) + 1}_csv_{sheetSafeName}", # Simplified naming with sheet label
"ext": "csv",
- "contentType": "text/csv",
+ "mimeType": "text/csv",
"data": csvContent,
"base64Encoded": False,
"metadata": {
@@ -867,7 +910,7 @@ def extractExcelContent(fileName: str, fileContent: bytes, mimeType: str) -> Lis
"sequenceNr": 1,
"name": "1_excel", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -897,7 +940,7 @@ def extractPowerpointContent(fileName: str, fileContent: bytes, mimeType: str) -
"sequenceNr": 1,
"name": "1_powerpoint", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
@@ -923,11 +966,165 @@ def extractBinaryContent(fileName: str, fileContent: bytes, mimeType: str) -> Li
"sequenceNr": 1,
"name": "1_binary", # Simplified naming
"ext": fileExtension,
- "contentType": mimeType,
+ "mimeType": mimeType,
"data": base64.b64encode(fileContent).decode('utf-8'),
"base64Encoded": True,
"metadata": {
"isText": False,
"format": "binary"
}
- }]
\ No newline at end of file
+ }]
+
+    def processFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+        """
+        Process a file and return its contents as a list of documents.
+
+        Args:
+            fileContent: Binary content of the file
+            fileName: Name of the file
+            fileMetadata: Optional metadata about the file
+
+        Returns:
+            List of document dictionaries
+        """
+        try:
+            # Get file extension and MIME type
+            fileExtension = os.path.splitext(fileName)[1].lower()[1:]
+            mimeType = fileMetadata.get("mimeType", self.mydom.getMimeType(fileName)) if fileMetadata else self.mydom.getMimeType(fileName)
+
+            # Process based on file type
+            if mimeType.startswith("image/"):
+                return self._processImageFile(fileContent, fileName, fileExtension, mimeType, fileMetadata)
+            elif mimeType == "application/pdf":
+                return self._processPdfFile(fileContent, fileName, fileMetadata)
+            elif mimeType == "text/csv":
+                return self._processCsvFile(fileContent, fileName, fileMetadata)
+            elif mimeType == "text/plain":
+                return self._processTextFile(fileContent, fileName, fileMetadata)
+            else:
+                # Default binary file handling
+                return [{
+                    "name": fileName,
+                    "ext": fileExtension,
+                    "mimeType": mimeType,
+                    "data": base64.b64encode(fileContent).decode('utf-8'),
+                    "base64Encoded": True,
+                    "metadata": {
+                        "isText": False
+                    }
+                }]
+
+        except Exception as e:
+            logger.error(f"Error processing file {fileName}: {str(e)}")
+            raise FileProcessingError(f"Error processing file: {str(e)}")
+
+ def _processImageFile(self, fileContent: bytes, fileName: str, fileExtension: str, mimeType: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+ """Process an image file."""
+ try:
+ # Create image document
+ imageDoc = {
+ "name": fileName,
+ "ext": fileExtension,
+ "mimeType": mimeType,
+ "data": base64.b64encode(fileContent).decode('utf-8'),
+ "base64Encoded": True,
+ "metadata": {
+ "isText": False,
+ "isImage": True,
+ "format": fileExtension
+ }
+ }
+
+ # Add image description if available
+ if fileMetadata and "description" in fileMetadata:
+ imageDoc["metadata"]["description"] = fileMetadata["description"]
+
+ return [imageDoc]
+
+ except Exception as e:
+ logger.error(f"Error processing image file {fileName}: {str(e)}")
+ raise FileProcessingError(f"Error processing image file: {str(e)}")
+
+ def _processPdfFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+ """Process a PDF file."""
+ try:
+ # Create PDF document
+ pdfDoc = {
+ "name": fileName,
+ "ext": "pdf",
+ "mimeType": "application/pdf",
+ "data": base64.b64encode(fileContent).decode('utf-8'),
+ "base64Encoded": True,
+ "metadata": {
+ "isText": False,
+ "isPdf": True
+ }
+ }
+
+ return [pdfDoc]
+
+ except Exception as e:
+ logger.error(f"Error processing PDF file {fileName}: {str(e)}")
+ raise FileProcessingError(f"Error processing PDF file: {str(e)}")
+
+ def _processCsvFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+ """Process a CSV file."""
+ try:
+ # Try to decode as text first
+ try:
+ csvContent = fileContent.decode('utf-8')
+ base64Encoded = False
+ except UnicodeDecodeError:
+ # If not valid UTF-8, encode as base64
+ csvContent = base64.b64encode(fileContent).decode('utf-8')
+ base64Encoded = True
+
+ # Create CSV document
+ csvDoc = {
+ "name": fileName,
+ "ext": "csv",
+ "mimeType": "text/csv",
+ "data": csvContent,
+ "base64Encoded": base64Encoded,
+ "metadata": {
+ "isText": True,
+ "isCsv": True,
+ "format": "csv"
+ }
+ }
+
+ return [csvDoc]
+
+ except Exception as e:
+ logger.error(f"Error processing CSV file {fileName}: {str(e)}")
+ raise FileProcessingError(f"Error processing CSV file: {str(e)}")
+
+ def _processTextFile(self, fileContent: bytes, fileName: str, fileMetadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
+ """Process a text file."""
+ try:
+ # Try to decode as text
+ try:
+ textContent = fileContent.decode('utf-8')
+ base64Encoded = False
+ except UnicodeDecodeError:
+ # If not valid UTF-8, encode as base64
+ textContent = base64.b64encode(fileContent).decode('utf-8')
+ base64Encoded = True
+
+ # Create text document
+ textDoc = {
+ "name": fileName,
+ "ext": "txt",
+ "mimeType": "text/plain",
+ "data": textContent,
+ "base64Encoded": base64Encoded,
+ "metadata": {
+ "isText": True
+ }
+ }
+
+ return [textDoc]
+
+ except Exception as e:
+ logger.error(f"Error processing text file {fileName}: {str(e)}")
+ raise FileProcessingError(f"Error processing text file: {str(e)}")
\ No newline at end of file
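The contentType -> mimeType rename above touches every content dictionary the extractors emit, so content records written before this change still carry the old key. A minimal migration sketch, assuming stored records are plain dictionaries shaped like the ones in this diff (the helper name is hypothetical and not part of the codebase):

    def migrateContentTypeField(contents):
        # Rename the legacy 'contentType' key to 'mimeType' in place,
        # leaving records that already use the new key untouched.
        for content in contents:
            if "contentType" in content and "mimeType" not in content:
                content["mimeType"] = content.pop("contentType")
        return contents

Running something like this once over previously stored documents would keep old and new records readable through the renamed fields.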
diff --git a/modules/lucydomInterface.py b/modules/lucydomInterface.py
index a2106d6d..b09629d5 100644
--- a/modules/lucydomInterface.py
+++ b/modules/lucydomInterface.py
@@ -358,11 +358,14 @@ class LucyDOMInterface:
return hashlib.sha256(fileContent).hexdigest()
def checkForDuplicateFile(self, fileHash: str) -> Optional[Dict[str, Any]]:
- """Checks if a file with the same hash already exists."""
- files = self.db.getRecordset("files", recordFilter={"fileHash": fileHash})
- filteredFiles = self._uam("files", files)
- if filteredFiles:
- return filteredFiles[0]
+ """Checks if a file with the same hash already exists for the current user and mandate."""
+ files = self.db.getRecordset("files", recordFilter={
+ "fileHash": fileHash,
+ "mandateId": self.mandateId,
+ "userId": self.userId
+ })
+ if files:
+ return files[0]
return None
def getMimeType(self, filename: str) -> str:
@@ -670,7 +673,7 @@ class LucyDOMInterface:
fileHash = self.calculateFileHash(fileContent)
logger.debug(f"Calculated file hash: {fileHash}")
- # Check for duplicate
+ # Check for duplicate within same user/mandate
existingFile = self.checkForDuplicateFile(fileHash)
if existingFile:
logger.info(f"Duplicate found for {fileName}: {existingFile['id']}")
@@ -692,9 +695,6 @@ class LucyDOMInterface:
# Save binary data
logger.info(f"Saving file content to database for file: {fileName}")
self.createFileData(dbFile["id"], fileContent)
-
- # Debug: Export file to static folder
- self._exportFileToStatic(fileContent, dbFile["id"], fileName)
logger.info(f"File upload process completed for: {fileName}")
return dbFile
@@ -731,12 +731,6 @@ class LucyDOMInterface:
logger.error(f"Error downloading file {fileId}: {str(e)}")
raise FileError(f"Error downloading file: {str(e)}")
- def _exportFileToStatic(self, fileContent: bytes, fileId: int, fileName: str):
- """Debug helper to export files to static folder."""
- debugFilename = f"{fileId}_{fileName}"
- with open(f"./static/{debugFilename}", 'wb') as f:
- f.write(fileContent)
-
# Workflow methods
def getAllWorkflows(self) -> List[Dict[str, Any]]:
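With the narrowed duplicate check, the same file hash may now exist once per mandate/user pair instead of once globally. A hedged usage sketch of the flow shown above, where `dom` stands in for an already constructed LucyDOMInterface instance (an assumption for illustration only):

    import hashlib

    fileContent = b"example file bytes"
    fileHash = hashlib.sha256(fileContent).hexdigest()

    existing = dom.checkForDuplicateFile(fileHash)
    if existing:
        # Only files with the same mandateId/userId count as duplicates now
        print(f"Duplicate of file {existing['id']} in this context")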
diff --git a/modules/lucydomModel.py b/modules/lucydomModel.py
index c14ab9f3..782d0b0e 100644
--- a/modules/lucydomModel.py
+++ b/modules/lucydomModel.py
@@ -110,7 +110,7 @@ class DocumentContent(BaseModel):
sequenceNr: int = Field(1, description="Sequence number of the content in the source document")
name: str = Field(description="Designation")
ext: str = Field(description="Content extension for export: txt, csv, json, jpg, png")
- contentType: str = Field(description="MIME type")
+ mimeType: str = Field(description="MIME type")
summary: str = Field(description="Summary of the file content")
data: str = Field(description="Actual content, text or base64 encoded based on base64Encoded flag")
base64Encoded: bool = Field(description="Flag indicating whether the data is base64 encoded")
@@ -122,6 +122,7 @@ class Document(BaseModel):
name: str = Field(description="Name of the data object")
ext: str = Field(description="Extension of the data object")
fileId: int = Field(description="ID of the referenced file in the database")
+ mimeType: str = Field(description="MIME type")
data: str = Field(description="Content of the data as text or base64 encoded based on base64Encoded flag")
base64Encoded: bool = Field(description="Flag indicating whether the data is base64 encoded")
contents: List[DocumentContent] = Field(description="Document contents")
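Document now carries a mimeType alongside ext, so documents persisted before this change may need the field backfilled. One way to derive it from the stored extension using only the standard library, sketched here as an assumption rather than code from the repository:

    import mimetypes

    def guessMimeTypeFromExt(ext: str) -> str:
        # mimetypes works on filenames, so synthesize one from the extension
        guessed, _ = mimetypes.guess_type(f"file.{ext}")
        return guessed or "application/octet-stream"

    # e.g. guessMimeTypeFromExt("csv") -> "text/csv"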
diff --git a/modules/workflowAgentsRegistry.py b/modules/workflowAgentsRegistry.py
index 25d8d2ff..0847442b 100644
--- a/modules/workflowAgentsRegistry.py
+++ b/modules/workflowAgentsRegistry.py
@@ -85,51 +85,45 @@ class AgentBase:
"""Wrapper for the utility function"""
return isTextMimeType(mimeType)
- def formatAgentDocumentOutput(self, label: str, content: Any, contentType: str = None) -> Dict[str, Any]:
+ def formatAgentDocumentOutput(self, label: str, content: Any, mimeType: str = None) -> Dict[str, Any]:
"""
- Helper method to properly format a document output with base64Encoded flag and metadata.
+ Format agent output as a document.
Args:
- label: Name of the document
+ label: Label for the document
content: Content of the document
- contentType: Optional content type for the document
-
- Returns:
- Properly formatted document dictionary
+ mimeType: Optional MIME type for the document
"""
- import base64
-
- # Determine if content should be base64 encoded
- should_base64_encode = self.determineBase64EncodingFlag(label, content)
-
- # Process content based on type and encoding flag
- formatted_content = content
-
- if should_base64_encode:
- if isinstance(content, bytes):
- # Convert binary to base64
- formatted_content = base64.b64encode(content).decode('utf-8')
- elif isinstance(content, str):
- try:
- # Check if it's already base64 encoded
- base64.b64decode(content)
- # If we get here, it appears to be valid base64
- formatted_content = content
- except:
- # Not valid base64, so encode it
- formatted_content = base64.b64encode(content.encode('utf-8')).decode('utf-8')
-
- # Create document with metadata
+ # Create document structure
doc = {
- "label": label,
- "content": formatted_content,
- "base64Encoded": should_base64_encode,
- "metadata": {}
+ "id": str(uuid.uuid4()),
+ "name": label,
+ "ext": "txt", # Default extension
+ "data": content,
+ "base64Encoded": False,
+ "metadata": {
+ "isText": True
+ }
}
- # Add content type if provided
- if contentType:
- doc["metadata"]["contentType"] = contentType
+ # Set MIME type if provided
+ if mimeType:
+ doc["mimeType"] = mimeType
+ # Update extension based on MIME type
+ if mimeType == "text/markdown":
+ doc["ext"] = "md"
+ elif mimeType == "text/html":
+ doc["ext"] = "html"
+ elif mimeType == "text/csv":
+ doc["ext"] = "csv"
+ elif mimeType == "application/json":
+ doc["ext"] = "json"
+ elif mimeType.startswith("image/"):
+ doc["ext"] = mimeType.split("/")[1]
+ doc["metadata"]["isText"] = False
+ elif mimeType == "application/pdf":
+ doc["ext"] = "pdf"
+ doc["metadata"]["isText"] = False
return doc
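A quick usage sketch of the reworked helper, with `agent` standing in for any AgentBase instance (hypothetical); note that the new body also relies on uuid being imported in this module:

    doc = agent.formatAgentDocumentOutput(
        label="weekly_report",
        content="# Weekly Report\n...",
        mimeType="text/markdown",
    )
    # Per the mapping above: doc["ext"] == "md", doc["base64Encoded"] is False,
    # and doc["metadata"]["isText"] stays True for text MIME types.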
diff --git a/modules/workflowManager.py b/modules/workflowManager.py
index 5d89d311..c4a6b520 100644
--- a/modules/workflowManager.py
+++ b/modules/workflowManager.py
@@ -10,7 +10,7 @@ import json
import re
import uuid
import base64
-from datetime import datetime
+from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional, Union, Tuple
from modules.mimeUtils import isTextMimeType, determineContentEncoding
@@ -382,6 +382,7 @@ Please analyze the request and create:
3. Do not define document inputs that don't exist or haven't been generated beforehand.
4. Create a logical sequence - earlier agents can create documents that are later used as inputs.
5. If the user has provided documents but hasn't clearly stated what they want, try to act according to the context.
+6. ALL documents provided by the user (where fileSource is "user") MUST be included in the work plan, even if they don't have content summaries or if content extraction failed.
Your answer must be strictly in the JSON_OUTPUT format, with no additions before or after the JSON object.
@@ -415,6 +416,7 @@ JSON_OUTPUT = {{
## RULES for inputDocuments:
1. The user request refers to documents where "fileSource" in available documents is "user". Those documents are in the focus for input
2. In case of redundant label in available documents, use document with highest sequenceNr if not specified differently
+3. ALL documents provided by the user MUST be included in the work plan, even if they don't have content summaries or if content extraction failed
## STRICT RULES FOR document "label":
1. Every document label MUST include a proper file extension that matches the content type.
@@ -789,8 +791,13 @@ filesDelivered = {self.parseJson2text(matchingDocuments)}
"fileId": fileId,
"name": os.path.splitext(fileNameExt)[0] if os.path.splitext(fileNameExt)[0] else "noname",
"ext": os.path.splitext(fileNameExt)[1][1:] if os.path.splitext(fileNameExt)[1] else "bin",
+ "mimeType": mimeType,
"data": encodedData,
"base64Encoded": base64Encoded,
+ "metadata": {
+ "isText": isTextFormat,
+ "base64Encoded": base64Encoded # For backward compatibility
+ },
"contents": []
}
@@ -799,7 +806,7 @@ filesDelivered = {self.parseJson2text(matchingDocuments)}
# Add summaries to each content item
for content in contents:
- content["summary"] = await self.messageSummarizeContent(content)
+ content["summary"] = await self.getContentExtraction(content)
# Ensure base64Encoded flag is set
if "base64Encoded" not in content:
@@ -861,92 +868,87 @@ filesDelivered = {self.parseJson2text(matchingDocuments)}
return preparedInputs
-
- async def messageSummarizeContent(self, content: Dict[str, Any]) -> str:
- return await self.getContentExtraction(
- content,
- "Create a very concise summary (1-2 sentences, maximum 200 characters) about this content."
- )
-
async def processDocumentForAgent(self, document: Dict[str, Any], docSpec: Dict[str, Any]) -> Dict[str, Any]:
- """
- Processes a document for an agent based on the document specification.
- Uses AI to extract relevant content from the document based on the specification.
-
- Args:
- document: The document to process
- docSpec: The document specification from the project manager
+ """
+ Processes a document for an agent based on the document specification.
+ Uses AI to extract relevant content from the document based on the specification.
- Returns:
- Processed document with AI-extracted content
- """
- processedDoc = document.copy()
- partSpec = docSpec.get("contentPart", "")
-
- # Process each content item in the document
- if "contents" in processedDoc:
- processedContents = []
+ Args:
+ document: The document to process
+ docSpec: The document specification from the project manager
+
+ Returns:
+ Processed document with AI-extracted content
+ """
+ processedDoc = document.copy()
+ partSpec = docSpec.get("contentPart", "")
- for content in processedDoc["contents"]:
- # Check if part required
- if partSpec != "" and partSpec != content.get("name"):
- continue
+ # Process each content item in the document
+ if "contents" in processedDoc:
+ processedContents = []
+
+ for content in processedDoc["contents"]:
+ # Check if part required
+ if partSpec != "" and partSpec != content.get("name"):
+ continue
- # Get the prompt from the document specification
- summary = docSpec.get("prompt", "Extract the relevant information from this document")
+ # Get the prompt from the document specification
+ summary = docSpec.get("prompt", "Extract the relevant information from this document")
+
+ # Process content using the shared helper function
+ processedContent = content.copy()
+ processedContent["dataExtracted"] = await self.getContentExtraction(content, summary)
+ processedContent["metadata"]["aiProcessed"] = True
+
+ processedContents.append(processedContent)
- # Process content using the shared helper function
- processedContent = content.copy()
- processedContent["dataExtracted"] = await self.getContentExtraction(content, summary)
- processedContent["metadata"]["aiProcessed"] = True
-
- processedContents.append(processedContent)
+ processedDoc["contents"] = processedContents
- processedDoc["contents"] = processedContents
-
- return processedDoc
+ return processedDoc
async def getContentExtraction(self, content: Dict[str, Any], prompt: str = None) -> str:
"""
- Helper function that extracts or summarizes content based on its type (text/image/binary).
+ Helper function that extracts or summarizes content based on its encoding.
+ For base64 encoded content, uses callAi4Image. For non-base64 content, uses callAi.
Args:
content: Content item to analyze
- prompt: Optional custom prompt for extraction (default prompts used if not provided)
+ prompt: Custom prompt for extraction (default prompts used if not provided)
Returns:
Extracted or summarized content as text
"""
- # Extract relevant information
- data = content.get("data", "")
- contentType = content.get("contentType", "text/plain")
- base64Encoded = content.get("base64Encoded", False)
-
- # Default prompts if none provided
- if prompt is None:
- text_prompt = "Create a very concise summary (1-2 sentences, maximum 200 characters) about this content."
- image_prompt = "Create a very concise summary (1-2 sentences, maximum 200 characters) about this image."
- else:
- text_prompt = prompt
- image_prompt = prompt
-
try:
- # For image content, use the specialized image analysis
- if base64Encoded:
- return await self.mydom.callAi4Image(data, contentType, image_prompt)
-
- # For text data, use the regular AI processing
- else:
- return await self.mydom.callAi([
- {"role": "system", "content": "You are a content analyzer. Process the provided content as instructed."},
- {"role": "user", "content": f"{text_prompt}\n\n{data}"}
- ])
+ # Get content data and encoding status
+ data = content.get("data", "")
+ isBase64 = content.get("base64Encoded", False)
+ # Default prompts if none provided
+ if prompt is None:
+ textPrompt = "Create a very concise summary (1-2 sentences, maximum 200 characters) about this content."
+ imagePrompt = "Create a very concise summary (1-2 sentences, maximum 200 characters) about this image."
+ else:
+ textPrompt = prompt
+ imagePrompt = prompt
+
+ # Handle base64 encoded content
+ if isBase64:
+ try:
+ # Pass base64 encoded data directly to callAi4Image
+ return await self.mydom.callAi4Image(data, content.get("mimeType", "application/octet-stream"), imagePrompt)
+ except Exception as e:
+ logger.error(f"Error processing base64 content: {str(e)}")
+ return f"Error processing content: {str(e)}"
+ else:
+ # For non-base64 content, use callAi
+ return await self.mydom.callAi([
+ {"role": "system", "content": "You are a content analyzer. Extract relevant information from the provided content."},
+ {"role": "user", "content": f"{textPrompt}\n\nContent:\n{data}"}
+ ], produceUserAnswer=True)
+
except Exception as e:
logger.error(f"Error processing content: {str(e)}")
- return f"Content of type {contentType} (processing failed)"
-
-
+ return f"Error processing content: {str(e)}"
def messageAdd(self, workflow: Dict[str, Any], message: Dict[str, Any]) -> Dict[str, Any]:
"""
@@ -1086,56 +1088,69 @@ filesDelivered = {self.parseJson2text(matchingDocuments)}
List of file IDs for the saved documents
"""
fileIds = []
+ used_names = set() # Track used names to prevent duplicates
# Extract documents from agent results
documents = agentResults.get("documents", [])
for doc in documents:
try:
- # Extract label (filename) and content
- label = doc.get("label", "unnamed_file.txt")
- content = doc.get("content", "")
+ # Extract document data according to LucyDOM model
+ name = doc.get("name", "")
+ ext = doc.get("ext", "")
+ data = doc.get("data", "")
base64Encoded = doc.get("base64Encoded", False)
- # Split label into name and extension
- name, ext = os.path.splitext(label)
- if ext.startswith('.'):
- ext = ext[1:] # Remove leading dot
- elif not ext:
- # If no extension is provided, default to .txt for text content
- ext = "txt"
- label = f"{label}.{ext}"
+ # Skip if no name or data
+ if not name or not data:
+ logger.warning(f"Skipping document with missing name or data. Name: {name}, Has data: {bool(data)}")
+ continue
+
+ # Ensure unique filename
+ base_name = name
+ counter = 1
+ while f"{base_name}.{ext}" in used_names:
+ base_name = f"{name}_{counter}"
+ counter += 1
+ used_names.add(f"{base_name}.{ext}")
# Convert content to bytes based on base64Encoded flag
- if isinstance(content, str):
+ if isinstance(data, str):
if base64Encoded:
# Decode base64 to bytes
try:
import base64
- fileContent = base64.b64decode(content)
+ fileContent = base64.b64decode(data)
except Exception as e:
logger.warning(f"Failed to decode base64 content: {str(e)}")
- fileContent = content.encode('utf-8')
+ fileContent = data.encode('utf-8')
base64Encoded = False
else:
# Convert text to bytes
- fileContent = content.encode('utf-8')
+ fileContent = data.encode('utf-8')
else:
# Already bytes
- fileContent = content
+ fileContent = data
# Determine MIME type based on extension
- mimeType = self.mydom.getMimeType(label)
+ mimeType = self.mydom.getMimeType(f"{base_name}.{ext}")
- # Save file to database
- fileMeta = self.mydom.saveUploadedFile(fileContent, label)
+ # Create file metadata
+ fileMeta = self.mydom.createFile(
+ name=base_name,
+ mimeType=mimeType,
+ size=len(fileContent)
+ )
if fileMeta and "id" in fileMeta:
- fileId = fileMeta["id"]
- fileIds.append(fileId)
- logger.info(f"Saved document '{label}' with file ID: {fileId} (base64Encoded: {base64Encoded})")
+ # Save file content
+ if self.mydom.createFileData(fileMeta["id"], fileContent):
+ fileIds.append(fileMeta["id"])
+ logger.info(f"Saved document '{base_name}.{ext}' with file ID: {fileMeta['id']} (base64Encoded: {base64Encoded})")
+ else:
+ logger.warning(f"Failed to save content for document '{base_name}.{ext}'")
else:
- logger.warning(f"Failed to save document '{label}'")
+ logger.warning(f"Failed to create file metadata for '{base_name}.{ext}'")
except Exception as e:
logger.error(f"Error saving document from agent results: {str(e)}")
@@ -1174,11 +1189,19 @@ filesDelivered = {self.parseJson2text(matchingDocuments)}
# Extract summaries from all contents
contentSummaries = []
- for content in doc.get("contents", []):
+ if "contents" in doc and doc["contents"]:
+ for content in doc["contents"]:
+ contentSummaries.append({
+ "contentPart": content.get("name", "noname"),
+ "metadata": content.get("metadata", ""),
+ "summary": content.get("summary", "No summary"),
+ })
+ else:
+ # Add a default content summary if no contents exist
contentSummaries.append({
- "contentPart": content.get("name", "noname"),
- "metadata": content.get("metadata", ""),
- "summary": content.get("summary", "No summary"),
+ "contentPart": "1_undefined",
+ "metadata": "",
+ "summary": "No content extracted",
})
# Create document info
@@ -1277,11 +1300,12 @@ filesDelivered = {self.parseJson2text(matchingDocuments)}
# Singleton factory for the WorkflowManager
_workflowManagers = {}
+_workflowManagerLastAccess = {} # Track last access time for cleanup
def getWorkflowManager(mandateId: int = 0, userId: int = 0) -> WorkflowManager:
"""
Returns a WorkflowManager for the specified context.
- Reuses existing instances.
+    Reuses existing instances and evicts instances that have been inactive for more than one hour.
Args:
mandateId: ID of the mandate
@@ -1291,6 +1315,32 @@ def getWorkflowManager(mandateId: int = 0, userId: int = 0) -> WorkflowManager:
WorkflowManager instance
"""
contextKey = f"{mandateId}_{userId}"
+ current_time = datetime.now()
+
+ # Update last access time
+ _workflowManagerLastAccess[contextKey] = current_time
+
+ # Cleanup old instances (older than 1 hour)
+ cleanup_threshold = current_time - timedelta(hours=1)
+ for key in list(_workflowManagers.keys()):
+ if _workflowManagerLastAccess.get(key, current_time) < cleanup_threshold:
+ del _workflowManagers[key]
+ del _workflowManagerLastAccess[key]
+
if contextKey not in _workflowManagers:
_workflowManagers[contextKey] = WorkflowManager(mandateId, userId)
- return _workflowManagers[contextKey]
\ No newline at end of file
+ return _workflowManagers[contextKey]
+
+def cleanupWorkflowManager(mandateId: int, userId: int) -> None:
+ """
+ Explicitly cleanup a WorkflowManager instance.
+
+ Args:
+ mandateId: ID of the mandate
+ userId: ID of the user
+ """
+ contextKey = f"{mandateId}_{userId}"
+ if contextKey in _workflowManagers:
+ del _workflowManagers[contextKey]
+ if contextKey in _workflowManagerLastAccess:
+ del _workflowManagerLastAccess[contextKey]
\ No newline at end of file
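The factory now evicts managers that have not been requested for over an hour and offers an explicit teardown. A short usage sketch based on the functions above:

    wm = getWorkflowManager(mandateId=1, userId=7)    # created and cached
    wm2 = getWorkflowManager(mandateId=1, userId=7)   # same instance, last-access refreshed
    assert wm is wm2

    # Instances idle for more than an hour are dropped on the next factory call;
    # cleanupWorkflowManager(1, 7) removes one immediately.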
diff --git a/notes/changelog.txt b/notes/changelog.txt
index 72a0aafe..1aeb06a9 100644
--- a/notes/changelog.txt
+++ b/notes/changelog.txt
@@ -1,28 +1,28 @@
....................... TASKS
+
+Check data extraction for all content types!
+
+Send a final progress message at 100% completion
+
+
+
----------------------- OPEN
PRIO1:
-CHECK: If pictures not displayed to check utf-8 encoding in the base64 string!! general file writing and reading (example with svg)
-
-add connector to myoutlook
+sharepoint connector with document search, content search, content extraction
PRIO2:
-todo an agent for "code writing and editing" connected to the codebase, working in loops over each document...
-
sharepoint connector with document search, content search, content extraction
Split big files into content-parts
Integrate NDA Text as modal form - Data governance agreement by login with checkbox
-frontend to react
-
-frontend: no labels definition
PRIO3:
diff --git a/static/10_email_preview.html b/static/10_email_preview.html
deleted file mode 100644
index c900e097..00000000
--- a/static/10_email_preview.html
+++ /dev/null
@@ -1,42 +0,0 @@
-
-
-
-
-
- Email Preview: Verschiebung des Meetings auf Freitag
-
-
-
-
-
-
Email Template Preview
-
-
-
-
To:
-
peter.muster@domain.com
-
-
-
Subject:
-
Verschiebung des Meetings auf Freitag
-
-
-
Sehr geehrter Herr Muster,
ich hoffe, es geht Ihnen gut. Ich schreibe Ihnen, um unser geplantes Meeting von 10 Uhr auf Freitag zu verschieben. Bitte lassen Sie mich wissen, ob dieser neue Termin für Sie passt.
Vielen Dank für Ihr Verständnis.
Mit freundlichen Grüßen, [Ihr Name]
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/static/11_email_template.json b/static/11_email_template.json
deleted file mode 100644
index bf14e27b..00000000
--- a/static/11_email_template.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "recipient": "peter.muster@domain.com",
- "subject": "Verschiebung des Meetings auf Freitag",
- "plainBody": "Sehr geehrter Herr Muster,\n\nich hoffe, es geht Ihnen gut. Ich schreibe Ihnen, um unser geplantes Meeting von 10 Uhr auf Freitag zu verschieben. Bitte lassen Sie mich wissen, ob dieser neue Termin f\u00fcr Sie passt.\n\nVielen Dank f\u00fcr Ihr Verst\u00e4ndnis.\n\nMit freundlichen Gr\u00fc\u00dfen,\n\n[Ihr Name]",
- "htmlBody": "
Sehr geehrter Herr Muster,
ich hoffe, es geht Ihnen gut. Ich schreibe Ihnen, um unser geplantes Meeting von 10 Uhr auf Freitag zu verschieben. Bitte lassen Sie mich wissen, ob dieser neue Termin f\u00fcr Sie passt.
Vielen Dank f\u00fcr Ihr Verst\u00e4ndnis.
Mit freundlichen Gr\u00fc\u00dfen, [Ihr Name]
"
-}
\ No newline at end of file
diff --git a/static/12_email_preview.html b/static/12_email_preview.html
deleted file mode 100644
index 87962b01..00000000
--- a/static/12_email_preview.html
+++ /dev/null
@@ -1,42 +0,0 @@
-
-
-
-
-
- Email Preview: Anfrage zur Terminverschiebung
-
-
-
-
-
-
Email Template Preview
-
-
-
-
To:
-
peter.muster@domain.com
-
-
-
Subject:
-
Anfrage zur Terminverschiebung
-
-
-
Sehr geehrter Herr Muster,
ich hoffe, diese Nachricht trifft Sie wohl. Ich schreibe Ihnen, um eine Verschiebung unseres Termins von 10 Uhr auf Freitag zu erbitten. Bitte lassen Sie mich wissen, ob dies für Sie möglich ist.
Vielen Dank im Voraus für Ihre Flexibilität.
Mit freundlichen Grüßen, [Ihr Name]
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/static/13_email_template.json b/static/13_email_template.json
deleted file mode 100644
index ab8a946c..00000000
--- a/static/13_email_template.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "recipient": "peter.muster@domain.com",
- "subject": "Anfrage zur Terminverschiebung",
- "plainBody": "Sehr geehrter Herr Muster,\n\nich hoffe, diese Nachricht trifft Sie wohl. Ich schreibe Ihnen, um eine Verschiebung unseres Termins von 10 Uhr auf Freitag zu erbitten. Bitte lassen Sie mich wissen, ob dies f\u00fcr Sie m\u00f6glich ist.\n\nVielen Dank im Voraus f\u00fcr Ihre Flexibilit\u00e4t.\n\nMit freundlichen Gr\u00fc\u00dfen,\n\n[Ihr Name]",
- "htmlBody": "
Sehr geehrter Herr Muster,
ich hoffe, diese Nachricht trifft Sie wohl. Ich schreibe Ihnen, um eine Verschiebung unseres Termins von 10 Uhr auf Freitag zu erbitten. Bitte lassen Sie mich wissen, ob dies f\u00fcr Sie m\u00f6glich ist.
Vielen Dank im Voraus f\u00fcr Ihre Flexibilit\u00e4t.
Mit freundlichen Gr\u00fc\u00dfen, [Ihr Name]
"
-}
\ No newline at end of file
diff --git a/static/14_microsoft_authentication.html b/static/14_microsoft_authentication.html
deleted file mode 100644
index b8a50d7f..00000000
--- a/static/14_microsoft_authentication.html
+++ /dev/null
@@ -1,47 +0,0 @@
-
-
-
-
-
- Microsoft Authentication Required
-
-
-
-
-
Microsoft Authentication Required
-
-
To create email templates and drafts, you need to authenticate with your Microsoft account. Follow these steps:
- 2
- Sign in with your Microsoft account and grant the required permissions
-
-
-
- 3
- Return to this application and run the email agent again after completing authentication
-
-
-
-
Note: You only need to authenticate once. Your session will be remembered for future email operations.
-
-
-
-
-
\ No newline at end of file
diff --git a/static/15_microsoft_authentication.html b/static/15_microsoft_authentication.html
deleted file mode 100644
index 521bae1c..00000000
--- a/static/15_microsoft_authentication.html
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-
-
-
- Microsoft Authentication Required
-
-
-
-
-
Microsoft Authentication Required
-
-
To create email templates and drafts, you need to authenticate with your Microsoft account.
-
-
The application will now initiate the Microsoft authentication process. Please follow the instructions in the authentication window.
-
-
-
Note: You only need to authenticate once. Your session will be remembered for future email operations.
-
-
-
-
-
\ No newline at end of file
diff --git a/static/16_email_preview.html b/static/16_email_preview.html
deleted file mode 100644
index 95096bad..00000000
--- a/static/16_email_preview.html
+++ /dev/null
@@ -1,42 +0,0 @@
-
-
-
-
-
- Email Preview: Verschiebung des Meetings auf Freitag
-
-
-
-
-
-
Email Template Preview
-
-
-
-
To:
-
peter.muster@domain.com
-
-
-
Subject:
-
Verschiebung des Meetings auf Freitag
-
-
-
Sehr geehrter Herr Muster,
ich hoffe, es geht Ihnen gut. Ich schreibe Ihnen, um unser geplantes Meeting um 10 Uhr auf Freitag zu verschieben. Bitte lassen Sie mich wissen, ob dieser Termin für Sie passt.
Vielen Dank für Ihr Verständnis.
Mit freundlichen Grüßen,
[Ihr Name]
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/static/17_email_template.json b/static/17_email_template.json
deleted file mode 100644
index 90e8f9f3..00000000
--- a/static/17_email_template.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "recipient": "peter.muster@domain.com",
- "subject": "Verschiebung des Meetings auf Freitag",
- "plainBody": "Sehr geehrter Herr Muster,\n\nich hoffe, es geht Ihnen gut. Ich schreibe Ihnen, um unser geplantes Meeting um 10 Uhr auf Freitag zu verschieben. Bitte lassen Sie mich wissen, ob dieser Termin f\u00fcr Sie passt.\n\nVielen Dank f\u00fcr Ihr Verst\u00e4ndnis.\n\nMit freundlichen Gr\u00fc\u00dfen,\n\n[Ihr Name]",
- "htmlBody": "
Sehr geehrter Herr Muster,
ich hoffe, es geht Ihnen gut. Ich schreibe Ihnen, um unser geplantes Meeting um 10 Uhr auf Freitag zu verschieben. Bitte lassen Sie mich wissen, ob dieser Termin f\u00fcr Sie passt.
anbei finden Sie die Datei 'prime_numbers.csv', die die Liste der Primzahlen enthält.
Mit freundlichen Grüßen, Ihr Team
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/static/22_email_template.json b/static/22_email_template.json
deleted file mode 100644
index ea8be43b..00000000
--- a/static/22_email_template.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
- "recipient": "recipient@example.com",
- "subject": "Prime Numbers CSV",
- "plainBody": "Sehr geehrte Damen und Herren,\n\nanbei finden Sie die Datei 'prime_numbers.csv', die die Liste der Primzahlen enth\u00e4lt.\n\nMit freundlichen Gr\u00fc\u00dfen,\nIhr Team",
- "htmlBody": "
Sehr geehrte Damen und Herren,
anbei finden Sie die Datei 'prime_numbers.csv', die die Liste der Primzahlen enth\u00e4lt.
Mit freundlichen Gr\u00fc\u00dfen, Ihr Team
"
-}
\ No newline at end of file
diff --git a/static/23_documentProcessor.py b/static/23_documentProcessor.py
deleted file mode 100644
index d3b637e1..00000000
--- a/static/23_documentProcessor.py
+++ /dev/null
@@ -1,933 +0,0 @@
-"""
-Module for extracting content from various file formats.
-Provides specialized functions for processing text, PDF, Office documents, images, etc.
-"""
-
-import logging
-import os
-import io
-from typing import Dict, Any, List, Optional, Union, Tuple
-import base64
-
-# Configure logger
-logger = logging.getLogger(__name__)
-
-# Optional imports - only loaded when needed
-pdfExtractorLoaded = False
-officeExtractorLoaded = False
-imageProcessorLoaded = False
-
-def getDocumentContents(fileMetadata: Dict[str, Any], fileContent: bytes) -> List[Dict[str, Any]]:
- """
- Main function for extracting content from a file based on its MIME type.
- Delegates to specialized extraction functions.
-
- Args:
- fileMetadata: File metadata (Name, MIME type, etc.)
- fileContent: Binary data of the file
-
- Returns:
- List of Document-Content objects with metadata and base64Encoded flag
- """
- try:
- mimeType = fileMetadata.get("mimeType", "application/octet-stream")
- fileName = fileMetadata.get("name", "unknown")
-
- logger.info(f"Extracting content from file '{fileName}' (MIME type: {mimeType})")
-
- # Extract content based on MIME type
- contents = []
-
- # Text-based formats (excluding CSV which has its own handler)
- if mimeType == "text/csv":
- contents.extend(extractCsvContent(fileName, fileContent))
-
- # Then handle other text-based formats
- elif mimeType.startswith("text/") or mimeType in [
- "application/json",
- "application/xml",
- "application/javascript",
- "application/x-python"
- ]:
- contents.extend(extractTextContent(fileName, fileContent, mimeType))
-
- # SVG Files
- elif mimeType == "image/svg+xml":
- contents.extend(extractSvgContent(fileName, fileContent))
-
- # Images
- elif mimeType.startswith("image/"):
- contents.extend(extractImageContent(fileName, fileContent, mimeType))
-
- # PDF Documents
- elif mimeType == "application/pdf":
- contents.extend(extractPdfContent(fileName, fileContent))
-
- # Word Documents
- elif mimeType in [
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- "application/msword"
- ]:
- contents.extend(extractWordContent(fileName, fileContent, mimeType))
-
- # Excel Documents
- elif mimeType in [
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
- "application/vnd.ms-excel"
- ]:
- contents.extend(extractExcelContent(fileName, fileContent, mimeType))
-
- # PowerPoint Documents
- elif mimeType in [
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
- "application/vnd.ms-powerpoint"
- ]:
- contents.extend(extractPowerpointContent(fileName, fileContent, mimeType))
-
- # Binary data as fallback for unknown formats
- else:
- contents.extend(extractBinaryContent(fileName, fileContent, mimeType))
-
- # Fallback when no content could be extracted
- if not contents:
- logger.warning(f"No content extracted from file '{fileName}', using binary fallback")
-
- # Convert binary content to base64
- encoded_data = base64.b64encode(fileContent).decode('utf-8')
-
- contents.append({
- "sequenceNr": 1,
- "name": '1_undefined',
- "ext": os.path.splitext(fileName)[1][1:] if os.path.splitext(fileName)[1] else "bin",
- "contentType": mimeType,
- "data": encoded_data,
- "base64Encoded": True,
- "metadata": {
- "isText": False
- }
- })
-
- # Add generic attributes for all documents
- for content in contents:
- # Make sure all content items have the base64Encoded flag
- if "base64Encoded" not in content:
- if isinstance(content.get("data"), bytes):
- # Convert bytes to base64
- content["data"] = base64.b64encode(content["data"]).decode('utf-8')
- content["base64Encoded"] = True
- else:
- # Assume text content if not explicitly marked
- content["base64Encoded"] = False
-
- # Maintain backward compatibility with old "base64Encoded" flag in metadata
- if "metadata" not in content:
- content["metadata"] = {}
-
- # Set base64Encoded in metadata for backward compatibility
- content["metadata"]["base64Encoded"] = content["base64Encoded"]
-
- logger.info(f"Successfully extracted {len(contents)} content items from file '{fileName}'")
- return contents
-
- except Exception as e:
- logger.error(f"Error during content extraction: {str(e)}")
- # Fallback on error - return original data
- return [{
- "sequenceNr": 1,
- "name": fileMetadata.get("name", "unknown"),
- "ext": os.path.splitext(fileMetadata.get("name", ""))[1][1:] if os.path.splitext(fileMetadata.get("name", ""))[1] else "bin",
- "contentType": fileMetadata.get("mimeType", "application/octet-stream"),
- "data": base64.b64encode(fileContent).decode('utf-8'),
- "base64Encoded": True,
- "metadata": {
- "isText": False,
- "base64Encoded": True # For backward compatibility
- }
- }]
-
-
-def _loadPdfExtractor():
- """Loads PDF extraction libraries when needed"""
- global pdfExtractorLoaded
- if not pdfExtractorLoaded:
- try:
- global PyPDF2, fitz
- import PyPDF2
- import fitz # PyMuPDF for more extensive PDF processing
- pdfExtractorLoaded = True
- logger.info("PDF extraction libraries successfully loaded")
- except ImportError as e:
- logger.warning(f"PDF extraction libraries could not be loaded: {e}")
-
-def _loadOfficeExtractor():
- """Loads Office document extraction libraries when needed"""
- global officeExtractorLoaded
- if not officeExtractorLoaded:
- try:
- global docx, openpyxl
- import docx # python-docx for Word documents
- import openpyxl # for Excel files
- officeExtractorLoaded = True
- logger.info("Office extraction libraries successfully loaded")
- except ImportError as e:
- logger.warning(f"Office extraction libraries could not be loaded: {e}")
-
-def _loadImageProcessor():
- """Loads image processing libraries when needed"""
- global imageProcessorLoaded
- if not imageProcessorLoaded:
- try:
- global PIL, Image
- from PIL import Image
- imageProcessorLoaded = True
- logger.info("Image processing libraries successfully loaded")
- except ImportError as e:
- logger.warning(f"Image processing libraries could not be loaded: {e}")
-
-def extractTextContent(fileName: str, fileContent: bytes, mimeType: str) -> List[Dict[str, Any]]:
- """
- Extracts text from text files.
-
- Args:
- fileName: Name of the file
- fileContent: Binary data of the file
- mimeType: MIME type of the file
-
- Returns:
- List of Text-Content objects with base64Encoded = False
- """
- try:
- # Keep original file extension
- fileExtension = os.path.splitext(fileName)[1][1:] if os.path.splitext(fileName)[1] else "txt"
-
- # Extract text content
- textContent = fileContent.decode('utf-8')
- return [{
- "sequenceNr": 1,
- "name": "1_text", # Simplified naming
- "ext": fileExtension,
- "contentType": "text/plain",
- "data": textContent,
- "base64Encoded": False,
- "metadata": {
- "isText": True
- }
- }]
- except UnicodeDecodeError:
- logger.warning(f"Could not decode text from file '{fileName}' as UTF-8, trying alternative encodings")
- try:
- # Try alternative encodings
- for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
- try:
- textContent = fileContent.decode(encoding)
- logger.info(f"Text successfully decoded with encoding {encoding}")
- return [{
- "sequenceNr": 1,
- "name": "1_text", # Simplified naming
- "ext": fileExtension,
- "contentType": "text/plain",
- "data": textContent,
- "base64Encoded": False,
- "metadata": {
- "isText": True,
- "encoding": encoding
- }
- }]
- except UnicodeDecodeError:
- continue
-
- # Fallback to binary data if no encoding works
- logger.warning(f"Could not decode text, using binary data")
- return [{
- "sequenceNr": 1,
- "name": "1_binary", # Simplified naming
- "ext": fileExtension,
- "contentType": mimeType,
- "data": base64.b64encode(fileContent).decode('utf-8'),
- "base64Encoded": True,
- "metadata": {
- "isText": False
- }
- }]
- except Exception as e:
- logger.error(f"Error in alternative text decoding: {str(e)}")
- # Return binary data as fallback
- return [{
- "sequenceNr": 1,
- "name": "1_binary", # Simplified naming
- "ext": fileExtension,
- "contentType": mimeType,
- "data": base64.b64encode(fileContent).decode('utf-8'),
- "base64Encoded": True,
- "metadata": {
- "isText": False
- }
- }]
-
-def extractCsvContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
- """
- Extracts content from CSV files.
-
- Args:
- fileName: Name of the file
- fileContent: Binary data of the file
-
- Returns:
- List of CSV-Content objects with base64Encoded = False
- """
- try:
- # Extract text content
- csvContent = fileContent.decode('utf-8')
- return [{
- "sequenceNr": 1,
- "name": "1_csv", # Simplified naming
- "ext": "csv",
- "contentType": "text/csv",
- "data": csvContent,
- "base64Encoded": False,
- "metadata": {
- "isText": True,
- "format": "csv"
- }
- }]
- except UnicodeDecodeError:
- logger.warning(f"Could not decode CSV from file '{fileName}' as UTF-8, trying alternative encodings")
- try:
- # Try alternative encodings for CSV
- for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
- try:
- csvContent = fileContent.decode(encoding)
- logger.info(f"CSV successfully decoded with encoding {encoding}")
- return [{
- "sequenceNr": 1,
- "name": "1_csv", # Simplified naming
- "ext": "csv",
- "contentType": "text/csv",
- "data": csvContent,
- "base64Encoded": False,
- "metadata": {
- "isText": True,
- "encoding": encoding,
- "format": "csv"
- }
- }]
- except UnicodeDecodeError:
- continue
-
- # Fallback to binary data
- return [{
- "sequenceNr": 1,
- "name": "1_binary", # Simplified naming
- "ext": "csv",
- "contentType": "text/csv",
- "data": base64.b64encode(fileContent).decode('utf-8'),
- "base64Encoded": True,
- "metadata": {
- "isText": False
- }
- }]
- except Exception as e:
- logger.error(f"Error in alternative CSV decoding: {str(e)}")
- return [{
- "sequenceNr": 1,
- "name": "1_binary", # Simplified naming
- "ext": "csv",
- "contentType": "text/csv",
- "data": base64.b64encode(fileContent).decode('utf-8'),
- "base64Encoded": True,
- "metadata": {
- "isText": False
- }
- }]
-
-def extractSvgContent(fileName: str, fileContent: bytes) -> List[Dict[str, Any]]:
- """
- Extracts content from SVG files.
-
- Args:
- fileName: Name of the file
- fileContent: Binary data of the file
-
- Returns:
- List of SVG-Content objects with dual text/image metadata
- """
- contents = []
-
- try:
- # Extract SVG as text content (XML)
- svgText = fileContent.decode('utf-8')
-
- # Check if it's actually SVG by looking for the SVG tag
- if "