# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Extract Data action for AI operations.
Extracts structured data from documents (key-value pairs, entities, facts, etc.).
"""

import logging
from typing import Dict, Any
from modules.workflows.methods.methodBase import action
from modules.datamodels.datamodelChat import ActionResult

logger = logging.getLogger(__name__)

# Structure used when dataStructure is missing, not a string, or unknown.
_DEFAULT_DATA_STRUCTURE = "nested"

# Prompt sentence appended for each supported dataStructure value.
_STRUCTURE_INSTRUCTIONS = {
    "flat": "Use a flat key-value structure with simple properties.",
    "nested": "Use a nested JSON structure with logical grouping of related data.",
    "list": "Structure the data as a list/array of objects, one per entity or record.",
}


def _normalizeFields(fields: Any) -> list:
    """Return the requested field names as a list of non-empty strings."""
    if fields is None:
        return []
    if isinstance(fields, (list, tuple, set, frozenset)):
        candidates = list(fields)
    else:
        # A bare string (or any other scalar) is one field name. Joining a
        # string directly would split it into single characters.
        candidates = [fields]
    # str() keeps non-string entries (e.g. ints) from breaking str.join.
    names = (str(entry).strip() for entry in candidates if entry is not None)
    return [name for name in names if name]


def _buildExtractionPrompt(fields: Any, dataStructure: Any) -> str:
    """Assemble the AI prompt text for the given fields and data structure."""
    prompt = "Extract structured data from the provided document(s)."

    fieldNames = _normalizeFields(fields)
    if fieldNames:
        fieldsStr = ", ".join(fieldNames)
        prompt += f" Extract the following specific fields: {fieldsStr}."
    else:
        prompt += " Extract all relevant data including names, dates, amounts, entities, and key information."

    # Only strings can name a structure; None or other types use the default
    # instead of raising AttributeError on .lower().
    if isinstance(dataStructure, str):
        structureKey = dataStructure.strip().lower()
    else:
        structureKey = _DEFAULT_DATA_STRUCTURE
    instruction = _STRUCTURE_INSTRUCTIONS.get(
        structureKey, _STRUCTURE_INSTRUCTIONS[_DEFAULT_DATA_STRUCTURE]
    )
    prompt += f" {instruction}"

    prompt += " Ensure all extracted data is accurate and complete."
    return prompt


# NOTE(review): the docstring text below is kept verbatim because the @action
# framework may read it at runtime - confirm before rewording it.
@action
async def extractData(self, parameters: Dict[str, Any]) -> ActionResult:
    """
    GENERAL:
    - Purpose: Extract structured data from documents (key-value pairs, entities, facts, etc.).
    - Input requirements: documentList (required); optional dataStructure, fields.
    - Output format: JSON by default, or specified resultType.

    Parameters:
    - documentList (list, required): Document reference(s) to extract data from.
    - dataStructure (str, optional): Desired data structure - flat, nested, or list. Default: nested.
    - fields (list, optional): Specific fields/properties to extract (e.g., ["name", "date", "amount"]).
    - resultType (str, optional): Output format (json, csv, xlsx, etc.). Default: json.
    """
    documentList = parameters.get("documentList", [])
    if not documentList:
        return ActionResult.isFailure(error="documentList is required")

    resultType = parameters.get("resultType", "json")
    aiPrompt = _buildExtractionPrompt(
        parameters.get("fields", []),
        parameters.get("dataStructure", _DEFAULT_DATA_STRUCTURE),
    )

    # The actual extraction is delegated to self.process with the built prompt.
    return await self.process({
        "aiPrompt": aiPrompt,
        "documentList": documentList,
        "resultType": resultType
    })