From 3cdd212606151e1f99c06a235403b758aeebd2be Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Sun, 4 Jan 2026 01:37:23 +0100 Subject: [PATCH] fixed renderers and metadata handover to validation --- modules/routes/routeRbac.py | 125 ++++++ .../serviceAi/subJsonResponseHandling.py | 418 ++++++++++++++++-- .../services/serviceAi/subStructureFilling.py | 104 ++++- .../serviceAi/subStructureGeneration.py | 7 + .../paths/ARCHITECTURE_ANALYSIS.md | 114 ----- .../paths/ARCHITECTURE_CHANGES.md | 77 ---- .../serviceGeneration/paths/codePath.py | 215 ++++++--- .../serviceGeneration/paths/documentPath.py | 9 +- .../renderers/documentRendererBaseTemplate.py | 21 + .../renderers/rendererCodeCsv.py | 38 +- .../renderers/rendererCodeJson.py | 52 ++- .../renderers/rendererCodeXml.py | 37 +- .../renderers/rendererCsv.py | 147 ++++-- .../renderers/rendererDocx.py | 12 + .../renderers/rendererHtml.py | 9 + .../renderers/rendererImage.py | 8 + .../renderers/rendererJson.py | 14 +- .../renderers/rendererMarkdown.py | 9 + .../renderers/rendererPdf.py | 9 + .../renderers/rendererPptx.py | 9 + .../renderers/rendererText.py | 16 + .../renderers/rendererXlsx.py | 9 + .../processing/adaptive/contentValidator.py | 240 +++++++--- 23 files changed, 1297 insertions(+), 402 deletions(-) delete mode 100644 modules/services/serviceGeneration/paths/ARCHITECTURE_ANALYSIS.md delete mode 100644 modules/services/serviceGeneration/paths/ARCHITECTURE_CHANGES.md diff --git a/modules/routes/routeRbac.py b/modules/routes/routeRbac.py index 363a6b81..8b5cf3e7 100644 --- a/modules/routes/routeRbac.py +++ b/modules/routes/routeRbac.py @@ -89,6 +89,131 @@ async def getPermissions( ) +@router.get("/permissions/all", response_model=Dict[str, Any]) +@limiter.limit("30/minute") +async def getAllPermissions( + request: Request, + context: Optional[str] = Query(None, description="Context type: UI or RESOURCE (if not provided, returns both)"), + currentUser: User = Depends(getCurrentUser) +) -> Dict[str, Any]: + """ + Get all RBAC permissions for the current user for UI and/or RESOURCE contexts. + This endpoint is optimized for UI initialization to avoid multiple API calls. + + Query Parameters: + - context: Optional context filter. If "UI", returns only UI permissions. + If "RESOURCE", returns only RESOURCE permissions. + If not provided, returns both UI and RESOURCE permissions. + + Returns: + - Dictionary with structure: + { + "ui": { + "item1": UserPermissions, + "item2": UserPermissions, + ... + }, + "resource": { + "item1": UserPermissions, + "item2": UserPermissions, + ... + } + } + If context is specified, only that context is returned. + + Example: + - GET /api/rbac/permissions/all + - GET /api/rbac/permissions/all?context=UI + - GET /api/rbac/permissions/all?context=RESOURCE + """ + try: + # Get interface and RBAC permissions + interface = getInterface(currentUser) + if not interface.rbac: + raise HTTPException( + status_code=500, + detail="RBAC interface not available" + ) + + # Determine which contexts to fetch + contextsToFetch = [] + if context: + try: + accessContext = AccessRuleContext(context.upper()) + if accessContext in [AccessRuleContext.UI, AccessRuleContext.RESOURCE]: + contextsToFetch = [accessContext] + else: + raise HTTPException( + status_code=400, + detail=f"Context '{context}' must be UI or RESOURCE for this endpoint" + ) + except ValueError: + raise HTTPException( + status_code=400, + detail=f"Invalid context '{context}'. Must be UI or RESOURCE" + ) + else: + # Return both UI and RESOURCE if no context specified + contextsToFetch = [AccessRuleContext.UI, AccessRuleContext.RESOURCE] + + result: Dict[str, Any] = {} + + # Get all access rules for user's roles + roleLabels = currentUser.roleLabels or [] + if not roleLabels: + # User has no roles, return empty permissions + for ctx in contextsToFetch: + result[ctx.value.lower()] = {} + return result + + # Get all access rules for user's roles and requested contexts + allRules: Dict[AccessRuleContext, List[AccessRule]] = {} + for ctx in contextsToFetch: + allRules[ctx] = [] + # Get all rules for user's roles in this context + for roleLabel in roleLabels: + rules = interface.getAccessRules( + roleLabel=roleLabel, + context=ctx, + pagination=None + ) + allRules[ctx].extend(rules) + + # Build result: for each context, collect all unique items and calculate permissions + for ctx in contextsToFetch: + result[ctx.value.lower()] = {} + + # Collect all unique items from rules + items = set() + for rule in allRules[ctx]: + if rule.item: + items.add(rule.item) + + # For each item, calculate user permissions + for item in sorted(items): + permissions = interface.rbac.getUserPermissions(currentUser, ctx, item) + # Only include if user has view permission + if permissions.view: + result[ctx.value.lower()][item] = { + "view": permissions.view, + "read": permissions.read.value if permissions.read else None, + "create": permissions.create.value if permissions.create else None, + "update": permissions.update.value if permissions.update else None, + "delete": permissions.delete.value if permissions.delete else None + } + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error getting all RBAC permissions: {str(e)}") + raise HTTPException( + status_code=500, + detail=f"Failed to get all permissions: {str(e)}" + ) + + @router.get("/rules", response_model=PaginatedResponse) @limiter.limit("30/minute") async def getAccessRules( diff --git a/modules/services/serviceAi/subJsonResponseHandling.py b/modules/services/serviceAi/subJsonResponseHandling.py index d9e6d0af..0e8820f2 100644 --- a/modules/services/serviceAi/subJsonResponseHandling.py +++ b/modules/services/serviceAi/subJsonResponseHandling.py @@ -397,22 +397,59 @@ class JsonResponseHandler: elif contentType == "table": # Merge table rows with sophisticated overlap detection - existingRows = existingElem.get("rows", []) - newRows = newElem.get("rows", []) + # CRITICAL: Tables can have rows in two places: + # 1. Direct: existingElem["rows"] (legacy format) + # 2. Nested: existingElem["content"]["rows"] (current format) + existingRows = None + newRows = None + + # Check nested structure first (current format) + if "content" in existingElem and isinstance(existingElem["content"], dict): + existingRows = existingElem["content"].get("rows", []) + # Fallback to direct structure (legacy format) + if not existingRows: + existingRows = existingElem.get("rows", []) + + # Check nested structure first (current format) + if "content" in newElem and isinstance(newElem["content"], dict): + newRows = newElem["content"].get("rows", []) + # Fallback to direct structure (legacy format) + if not newRows: + newRows = newElem.get("rows", []) + if existingRows and newRows: # Use sophisticated overlap detection that handles multiple overlapping rows mergedRows = JsonResponseHandler.mergeRowsWithOverlap(existingRows, newRows, iteration) - existingElem["rows"] = mergedRows + # Store in nested structure (current format) + if "content" not in existingElem: + existingElem["content"] = {} + existingElem["content"]["rows"] = mergedRows + # Also set type if missing + if "type" not in existingElem: + existingElem["type"] = "table" logger.debug(f"Iteration {iteration}: Merged table rows - existing: {len(existingRows)}, new: {len(newRows)}, total: {len(mergedRows)}") elif newRows: # If existing has no rows but new does, use new rows - existingElem["rows"] = newRows + if "content" not in existingElem: + existingElem["content"] = {} + existingElem["content"]["rows"] = newRows + if "type" not in existingElem: + existingElem["type"] = "table" # Preserve headers from existing (or use new if existing has none) - if not existingElem.get("headers") and newElem.get("headers"): - existingElem["headers"] = newElem["headers"] + # Headers can be in content.headers or directly in element + existingHeaders = existingElem.get("content", {}).get("headers", []) if "content" in existingElem else existingElem.get("headers", []) + newHeaders = newElem.get("content", {}).get("headers", []) if "content" in newElem else newElem.get("headers", []) + if not existingHeaders and newHeaders: + if "content" not in existingElem: + existingElem["content"] = {} + existingElem["content"]["headers"] = newHeaders # Preserve caption from existing (or use new if existing has none) - if not existingElem.get("caption") and newElem.get("caption"): - existingElem["caption"] = newElem.get("caption") + existingCaption = existingElem.get("content", {}).get("caption") if "content" in existingElem else existingElem.get("caption") + newCaption = newElem.get("content", {}).get("caption") if "content" in newElem else newElem.get("caption") + if not existingCaption and newCaption: + if "content" not in existingElem: + existingElem["content"] = {} + existingElem["content"]["caption"] = newCaption elif contentType in ["bullet_list", "numbered_list"]: # Merge list items with sophisticated overlap detection @@ -683,13 +720,13 @@ class JsonResponseHandler: last_element = {} elements.append(last_element) - # CRITICAL: Use ONLY deep recursive merging for ALL fragment types - # This handles ANY structure: arrays, objects, nested, primitives - # Handles overlap detection generically (deep recursive comparison) - # Handles continuation after cut-off (no overlap case) - merged_element = JsonResponseHandler.mergeDeepStructures( + # CRITICAL: GENERIC fragment merging for ALL structure types + # Automatically detects the structure type and merges accordingly + # Works for: tables, lists, code blocks, paragraphs, images, and any nested structures + merged_element = JsonResponseHandler._mergeFragmentIntoElement( last_element, fragment_data, + target_section, iteration, f"section.{target_section_id}.fragment" ) @@ -1062,6 +1099,202 @@ class JsonResponseHandler: logger.debug(f"Iteration {iteration}: Primitive at {path} differs, using new value") return new + @staticmethod + def _mergeFragmentIntoElement( + last_element: Dict[str, Any], + fragment_data: Any, + target_section: Dict[str, Any], + iteration: int, + path: str + ) -> Dict[str, Any]: + """ + GENERIC fragment merging for ALL structure types. + + Automatically detects the structure type and merges fragments accordingly. + Works for: tables, lists, code blocks, paragraphs, images, and any nested structures. + + Strategy: + 1. Analyze last_element structure to determine content location (content.rows, content.items, etc.) + 2. Detect fragment type (array, object, primitive) + 3. Merge fragment into appropriate location using mergeDeepStructures + + Args: + last_element: The existing element to merge into + fragment_data: The fragment data to merge (can be any JSON structure) + target_section: The target section (for content_type detection) + iteration: Current iteration number + path: Path for logging + + Returns: + Merged element + """ + contentType = target_section.get("content_type", "") + elementType = last_element.get("type", "") + + # Determine the content structure path based on element type and content type + # This handles both nested (content.rows) and flat (rows) structures + contentPath = None + fragmentIsArray = isinstance(fragment_data, list) and len(fragment_data) > 0 + + # Detect structure type and determine merge path + if contentType == "table" or elementType == "table": + # Tables: merge into content.rows or rows + if "content" in last_element and isinstance(last_element["content"], dict): + contentPath = "content.rows" + else: + contentPath = "rows" + elif contentType in ["bullet_list", "numbered_list", "list"] or elementType in ["bullet_list", "numbered_list", "list"]: + # Lists: merge into content.items or items + if "content" in last_element and isinstance(last_element["content"], dict): + contentPath = "content.items" + else: + contentPath = "items" + elif contentType == "code_block" or elementType == "code_block": + # Code blocks: merge into content.code or code + if "content" in last_element and isinstance(last_element["content"], dict): + contentPath = "content.code" + else: + contentPath = "code" + elif contentType in ["paragraph", "heading"] or elementType in ["paragraph", "heading"]: + # Text: merge into content.text or text + if "content" in last_element and isinstance(last_element["content"], dict): + contentPath = "content.text" + else: + contentPath = "text" + elif contentType == "image" or elementType == "image": + # Images: merge into base64Data + contentPath = "base64Data" + + # If we have a specific content path, merge into that location + if contentPath: + # Split path (e.g., "content.rows" -> ["content", "rows"]) + pathParts = contentPath.split(".") + + # Ensure nested structure exists + current = last_element + for i, part in enumerate(pathParts[:-1]): + if part not in current: + current[part] = {} + elif not isinstance(current[part], dict): + current[part] = {} + current = current[part] + + # Get existing content at target path + targetKey = pathParts[-1] + existingContent = current.get(targetKey, []) + + # Merge fragment into existing content + # CRITICAL: Handle both array fragments and object fragments generically + if fragmentIsArray: + # Fragment is an array - merge arrays + if isinstance(existingContent, list): + # Check if fragment is array of arrays (e.g., table rows) or array of primitives + if len(fragment_data) > 0 and isinstance(fragment_data[0], list): + # Array of arrays - use rows merge for tables, generic merge for others + if contentPath.endswith(".rows"): + mergedContent = JsonResponseHandler.mergeRowsWithOverlap(existingContent, fragment_data, iteration) + else: + # Generic array-of-arrays merge + mergedContent = JsonResponseHandler.mergeDeepStructures( + existingContent, + fragment_data, + iteration, + f"{path}.{targetKey}" + ) + else: + # Array of primitives - use items merge for lists, generic merge for others + if contentPath.endswith(".items"): + mergedContent = JsonResponseHandler.mergeItemsWithOverlap(existingContent, fragment_data, iteration) + else: + # Generic array merge using mergeDeepStructures + mergedContent = JsonResponseHandler.mergeDeepStructures( + existingContent, + fragment_data, + iteration, + f"{path}.{targetKey}" + ) + else: + # Existing content is not a list - replace with fragment + mergedContent = fragment_data + elif isinstance(fragment_data, dict): + # Fragment is an object - check if it contains nested content (e.g., {"content": {"rows": [...]}}) + # If fragment has same structure as target, merge nested content + if "content" in fragment_data and isinstance(fragment_data["content"], dict): + fragmentNested = fragment_data["content"] + # Check if fragment has the same key as our target (e.g., fragment.content.rows) + if targetKey in fragmentNested: + # Fragment has nested content matching our target - merge that content + fragmentNestedContent = fragmentNested[targetKey] + if isinstance(existingContent, list) and isinstance(fragmentNestedContent, list): + # Both are lists - merge them + if contentPath.endswith(".rows"): + mergedContent = JsonResponseHandler.mergeRowsWithOverlap(existingContent, fragmentNestedContent, iteration) + elif contentPath.endswith(".items"): + mergedContent = JsonResponseHandler.mergeItemsWithOverlap(existingContent, fragmentNestedContent, iteration) + else: + mergedContent = JsonResponseHandler.mergeDeepStructures( + existingContent, + fragmentNestedContent, + iteration, + f"{path}.{targetKey}" + ) + else: + # Use deep merge for nested content + mergedContent = JsonResponseHandler.mergeDeepStructures( + existingContent if existingContent else {}, + fragmentNestedContent, + iteration, + f"{path}.{targetKey}" + ) + else: + # Fragment has different structure - merge entire fragment object + mergedContent = JsonResponseHandler.mergeDeepStructures( + existingContent if existingContent else {}, + fragment_data, + iteration, + f"{path}.{targetKey}" + ) + else: + # Fragment is a simple object - use deep merge + mergedContent = JsonResponseHandler.mergeDeepStructures( + existingContent if existingContent else {}, + fragment_data, + iteration, + f"{path}.{targetKey}" + ) + else: + # Fragment is a primitive or unknown type - use deep merge + mergedContent = JsonResponseHandler.mergeDeepStructures( + existingContent if existingContent else {}, + fragment_data, + iteration, + f"{path}.{targetKey}" + ) + + # Update the merged content + current[targetKey] = mergedContent + + # Ensure type is set + if elementType and "type" not in last_element: + last_element["type"] = elementType + elif contentType and "type" not in last_element: + last_element["type"] = contentType + + logger.info(f"Iteration {iteration}: ✅ Merged fragment into {contentPath} for section '{target_section.get('id')}'") + return last_element + + # No specific content path - use generic deep merge + # This handles any structure type generically + merged_element = JsonResponseHandler.mergeDeepStructures( + last_element, + fragment_data, + iteration, + path + ) + + logger.info(f"Iteration {iteration}: ✅ Merged GENERIC fragment (type: {type(fragment_data).__name__}) into section '{target_section.get('id')}'") + return merged_element + @staticmethod def cleanEncodingIssues(jsonString: str) -> str: """ @@ -1091,26 +1324,160 @@ class JsonResponseHandler: newFragment: str ) -> str: """ - GENERIC function to merge two JSON strings, handling overlaps intelligently. + ROBUST generic function to merge two JSON strings, handling fragments cut anywhere. - Works for ANY JSON structure - no specific logic for content types. + Works for ANY JSON structure - handles cuts at beginning, middle, or end. - Overlap scenarios (all handled generically): + Fragment scenarios (all handled): + - Cut at beginning: newFragment starts mid-structure (e.g., `["item1", ...]`) + - Cut in middle: newFragment continues incomplete structure from accumulated + - Cut at end: accumulated ends mid-structure (e.g., `["item1", "item2", `) + - Full overlap: newFragment overlaps with end of accumulated - Exact continuation: newFragment starts exactly where accumulated ends - - Partial overlap: newFragment overlaps with end of accumulated - - Full overlap: newFragment is subset of accumulated Strategy: - 1. Find longest common suffix/prefix match (string-based comparison) - 2. Remove duplicate content - 3. Concatenate remaining parts + 1. Extract JSON from both strings (handles code fences, extra text) + 2. Detect if newFragment is a fragment (doesn't start with { or [ as root) + 3. Detect if accumulated ends mid-structure (incomplete) + 4. Repair incomplete structures using existing jsonUtils + 5. Merge intelligently based on structure analysis Args: - accumulated: Previously accumulated JSON string - newFragment: New fragment string to append + accumulated: Previously accumulated JSON string (may be incomplete) + newFragment: New fragment string to append (may be a fragment) Returns: - Combined JSON string with overlaps removed + Combined JSON string with fragments properly merged + """ + if not accumulated: + return newFragment + if not newFragment: + return accumulated + + # Step 1: Extract JSON from both strings (handles code fences, extra text) + from modules.shared.jsonUtils import extractJsonString, closeJsonStructures, tryParseJson + + accumulatedExtracted = extractJsonString(accumulated) + newFragmentExtracted = extractJsonString(newFragment) + + # Step 2: Detect fragment type and incomplete structures + accumulatedStripped = accumulatedExtracted.strip() + newFragmentStripped = newFragmentExtracted.strip() + + # Check if accumulated ends mid-structure + accumulatedEndsMidStructure = False + accumulatedParsed, accumulatedParseErr, _ = tryParseJson(accumulatedExtracted) + if accumulatedParseErr is not None: + # Cannot parse - likely incomplete + accumulatedEndsMidStructure = True + elif accumulatedStripped: + # Check if it ends with incomplete patterns (comma, incomplete string, etc.) + lastChar = accumulatedStripped[-1] if accumulatedStripped else '' + # Ends with comma - likely incomplete array/object element + if lastChar == ',' or accumulatedStripped.rstrip().endswith(','): + accumulatedEndsMidStructure = True + # Ends with incomplete string pattern (quote but no closing) + elif lastChar == '"' and accumulatedStripped.count('"') % 2 != 0: + accumulatedEndsMidStructure = True + + # Check if newFragment is a fragment + # A fragment can be: + # 1. Doesn't start with { or [ as root (plain text continuation) + # 2. Starts with [ but is part of a larger array (e.g., continuation of table rows) + # 3. Starts with { but is part of a larger object + isNewFragment = False + newFragmentParsed, newFragmentParseErr, _ = tryParseJson(newFragmentExtracted) + + if newFragmentParseErr is not None: + # Cannot parse - it's a fragment + isNewFragment = True + elif not (newFragmentStripped.startswith('{') or newFragmentStripped.startswith('[')): + # Doesn't start with JSON structure - it's a fragment + isNewFragment = True + elif accumulatedEndsMidStructure: + # Accumulated is incomplete - newFragment is likely a continuation fragment + # Even if it starts with [ or {, it might be continuing an incomplete structure + isNewFragment = True + + # Step 3: Handle fragment merging + if isNewFragment or accumulatedEndsMidStructure: + # This is a fragment continuation - merge by repairing and combining + + # Strategy: String-based merging for fragments + # 1. Remove trailing comma from accumulated if it ends with comma + accumulatedForMerge = accumulatedExtracted + if accumulatedStripped.rstrip().endswith(','): + # Remove trailing comma and whitespace + accumulatedForMerge = accumulatedExtracted.rstrip().rstrip(',').rstrip() + + # 2. Handle newFragment - if it starts with [ or {, it might be continuing an array/object + newFragmentForMerge = newFragmentExtracted.strip() + + # Special case: If accumulated ends with incomplete array element and newFragment starts with array element + # Pattern: accumulated ends with `["item1", "item2", ` and newFragment starts with `["item3", ...]` + # We need to merge them: `["item1", "item2", "item3", ...]` + if accumulatedStripped.rstrip().endswith(',') and newFragmentStripped.startswith('['): + # Check if newFragment is a complete array element or just starts with [ + # If it's a complete array element, we need to extract its content and merge + # Try to parse newFragment as a complete array + newFragmentArrayParsed, newFragmentArrayErr, _ = tryParseJson(newFragmentStripped) + if newFragmentArrayErr is None and isinstance(newFragmentArrayParsed, list): + # newFragment is a complete array - extract its content + # We need to merge the arrays: accumulated array + newFragment array + # But accumulated ends with comma, so we need to close it first + # Strategy: Remove trailing comma, add closing bracket, then merge arrays + accumulatedClosed = accumulatedForMerge + ']' + accumulatedClosedParsed, accumulatedClosedErr, _ = tryParseJson(accumulatedClosed) + if accumulatedClosedErr is None: + # Both are valid - merge parsed arrays + if isinstance(accumulatedClosedParsed, list): + mergedArray = accumulatedClosedParsed + newFragmentArrayParsed + # Now we need to reconstruct the JSON structure + # Find where the array starts in accumulated + # For now, use string replacement + # Find the last [ before the comma + lastBracketPos = accumulatedForMerge.rfind('[') + if lastBracketPos >= 0: + # Replace from [ to end with merged array + merged = accumulatedForMerge[:lastBracketPos] + json.dumps(mergedArray, ensure_ascii=False) + # Try to repair and parse + mergedRepaired = closeJsonStructures(merged) + mergedParsed, mergedErr, _ = tryParseJson(mergedRepaired) + if mergedErr is None and mergedParsed: + return json.dumps(mergedParsed, indent=2, ensure_ascii=False) + + # 3. Merge strings: accumulated (without trailing comma) + fragment + merged = accumulatedForMerge + newFragmentForMerge + + # 4. Try to repair the merged result + mergedRepaired = closeJsonStructures(merged) + + # 5. Try to parse the repaired result + mergedParsed, mergedErr, _ = tryParseJson(mergedRepaired) + + if mergedErr is None and mergedParsed: + # Successfully parsed - return formatted JSON + return json.dumps(mergedParsed, indent=2, ensure_ascii=False) + else: + # Still can't parse - try overlap detection as fallback + logger.debug(f"Fragment merge repair failed, trying overlap detection: {mergedErr}") + return JsonResponseHandler._mergeJsonStringsWithOverlapFallback( + accumulatedExtracted, newFragmentExtracted + ) + else: + # Both are complete JSON - use original overlap detection + return JsonResponseHandler._mergeJsonStringsWithOverlapFallback( + accumulatedExtracted, newFragmentExtracted + ) + + @staticmethod + def _mergeJsonStringsWithOverlapFallback( + accumulated: str, + newFragment: str + ) -> str: + """ + Fallback overlap detection using string comparison. + Used when both strings are complete JSON structures. """ if not accumulated: return newFragment @@ -1118,12 +1485,9 @@ class JsonResponseHandler: return accumulated # Find longest common suffix/prefix match - # Try different overlap lengths (from longest to shortest) - # Overlaps can be as small as 1 character, so we check all possible lengths maxOverlapLen = min(len(accumulated), len(newFragment)) # Start from maximum possible overlap down to 1 character - # This ensures we find the longest overlap, even if it's just 1 character for overlapLen in range(maxOverlapLen, 0, -1): accumulatedSuffix = accumulated[-overlapLen:] newFragmentPrefix = newFragment[:overlapLen] diff --git a/modules/services/serviceAi/subStructureFilling.py b/modules/services/serviceAi/subStructureFilling.py index 3d687398..cad01980 100644 --- a/modules/services/serviceAi/subStructureFilling.py +++ b/modules/services/serviceAi/subStructureFilling.py @@ -213,15 +213,16 @@ class StructureFiller: if not isinstance(doc["language"], str) or len(doc["language"]) != 2: raise ValueError(f"Document {doc.get('id')} has invalid language format in filled structure: {doc['language']} - should be 2-character ISO 639-1 code") - for chapter in doc.get("chapters", []): - for section in chapter.get("sections", []): - # Validation 4.2: Section missing 'elements' field - if "elements" not in section: - section["elements"] = [] - logger.info(f"Section {section.get('id')} missing 'elements' - created empty list") - - # Validation 4.3: Section has empty elements list - ALLOW (intentionally empty is OK) - # No action needed - empty elements are allowed + # CRITICAL: flattenedStructure has sections, not chapters! + # After flattening, chapters are converted to sections, so we need to validate sections directly + for section in doc.get("sections", []): + # Validation 4.2: Section missing 'elements' field + if "elements" not in section: + section["elements"] = [] + logger.info(f"Section {section.get('id')} missing 'elements' - created empty list") + + # Validation 4.3: Section has empty elements list - ALLOW (intentionally empty is OK) + # No action needed - empty elements are allowed # ChatLog abschließen self.services.chat.progressLogFinish(fillOperationId, True) @@ -246,6 +247,7 @@ class StructureFiller: contentParts: List[ContentPart], userPrompt: str, language: str, + outputFormat: str, parentOperationId: str, totalChapters: int ) -> None: @@ -271,7 +273,8 @@ class StructureFiller: contentPartInstructions=contentPartInstructions, contentParts=contentParts, userPrompt=userPrompt, - language=language + language=language, + outputFormat=outputFormat ) # AI-Call für Chapter-Struktur-Generierung @@ -372,6 +375,8 @@ class StructureFiller: docId = doc.get("id", "unknown") # Get language for this specific document docLanguage = self._getDocumentLanguage(chapterStructure, docId) + # Get output format for this specific document + docFormat = doc.get("outputFormat", "txt") for chapter in doc.get("chapters", []): chapterIndex += 1 @@ -382,7 +387,7 @@ class StructureFiller: contentPartIds, contentPartInstructions = self._extractContentPartInfo(chapter) # Create task for parallel processing with semaphore - async def processChapterWithSemaphore(chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions, docLanguage): + async def processChapterWithSemaphore(chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions, docLanguage, docFormat): checkWorkflowStopped(self.services) async with semaphore: return await self._generateSingleChapterSectionsStructure( @@ -397,12 +402,13 @@ class StructureFiller: contentParts=contentParts, userPrompt=userPrompt, language=docLanguage, # Use document-specific language + outputFormat=docFormat, # Use document-specific format parentOperationId=parentOperationId, totalChapters=totalChapters ) task = processChapterWithSemaphore( - chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions, docLanguage + chapter, chapterIndex, chapterId, chapterLevel, chapterTitle, generationHint, contentPartIds, contentPartInstructions, docLanguage, docFormat ) chapterTasks.append((chapterIndex, chapter, task)) @@ -1725,7 +1731,9 @@ The JSON should be a fragment that can be merged with the previous response.""" }] else: # Assign elements to section in correct order - originalSection["elements"] = result + # CRITICAL: Always assign elements, even if empty list + # This ensures sections always have an elements field for validation + originalSection["elements"] = result if result is not None else [] # Finish chapter operation after all sections processed self.services.chat.progressLogFinish(chapterOperationId, True) @@ -1830,7 +1838,13 @@ The JSON should be a fragment that can be merged with the previous response.""" # 2. Generierte Sections - adjust heading levels for section in chapter.get("sections", []): + # CRITICAL: Ensure elements are preserved when flattening + # _adjustSectionHeadingLevels uses deepcopy which should preserve elements, + # but verify that elements exist in the source section adjusted_section = self._adjustSectionHeadingLevels(section) + # Ensure elements are preserved (deepcopy should handle this, but double-check) + if "elements" in section and "elements" not in adjusted_section: + adjusted_section["elements"] = section["elements"] flattened_doc["sections"].append(adjusted_section) result["documents"].append(flattened_doc) @@ -1868,9 +1882,10 @@ The JSON should be a fragment that can be merged with the previous response.""" contentPartInstructions: Dict[str, Any], contentParts: List[ContentPart], userPrompt: str, - language: str = "en" + language: str = "en", + outputFormat: str = "txt" ) -> str: - """Baue Prompt für Chapter-Sections-Struktur-Generierung.""" + """Baue Prompt für Chapter-Sections-Struktur-Generierung, querying renderer for accepted section types.""" # Baue ContentParts-Index (nur IDs, keine Previews!) contentPartsIndex = "" for partId in contentPartIds: @@ -1904,6 +1919,9 @@ The JSON should be a fragment that can be merged with the previous response.""" if not contentPartsIndex: contentPartsIndex = "\n(No content parts specified for this chapter)" + # Query renderer for accepted section types + acceptedSectionTypes = self._getAcceptedSectionTypesForFormat(outputFormat) + prompt = f"""TASK: Generate Chapter Sections Structure LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}. @@ -1936,6 +1954,21 @@ If AVAILABLE CONTENT PARTS are listed above, then EVERY section that generates c ## CONTENT TYPES Available content types for sections: table, bullet_list, heading, paragraph, code_block, image +## ACCEPTED SECTION TYPES FOR THIS FORMAT +The document output format ({outputFormat}) accepts only the following section types: +{', '.join(acceptedSectionTypes) if acceptedSectionTypes else 'All section types'} + +**IMPORTANT**: Only create sections with content types from the accepted list above. Do not create sections with types that are not accepted by this format. + +## FORMAT-APPROPRIATE SECTION STRUCTURE +When determining which sections to create for this chapter, consider the document's output format ({outputFormat}) and ensure sections are structured appropriately for that format: +- Different formats have different capabilities and constraints +- Structure sections to match what the format can effectively represent +- Consider what content types work best for each format +- Ensure the section structure aligns with the format's strengths and limitations +- Select content types that are well-suited for the target format +- **CRITICAL**: Only use section types from the ACCEPTED SECTION TYPES list above + useAiCall RULES: - useAiCall: true ONLY if ContentPart Format is "extracted" AND transformation needed - useAiCall: false if Format is "object" or "reference" (direct insertion) @@ -2102,9 +2135,6 @@ Return only valid JSON. Do not include any explanatory text outside the JSON. contentStructureExample = self._getContentStructureExample(contentType) - # Special handling for image content type with IMAGE_GENERATE - isImageGeneration = contentType == "image" and len(validParts) == 0 - if isAggregation: prompt = f"""# TASK: Generate Section Content (Aggregation) @@ -2126,6 +2156,8 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles, 5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists. 6. Format based on content_type ({contentType}). 7. No HTML/styling: Plain text only, no markup. +8. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself. + ## OUTPUT FORMAT Return a JSON object with this structure: @@ -2177,6 +2209,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles, 3. Format based on content_type ({contentType}). 4. Return only valid JSON with "elements" array. 5. No HTML/styling: Plain text only, no markup. +6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself. ## OUTPUT FORMAT Return a JSON object with this structure: @@ -2221,6 +2254,7 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles, 3. The content should be relevant to the USER REQUEST and fit the context of surrounding sections. 4. Return only valid JSON with "elements" array. 5. No HTML/styling: Plain text only, no markup. +6. CONTINUE UNTIL COMPLETE: Extract ALL data from the provided context. Do NOT stop early because you think the response might be too long. Do NOT truncate or abbreviate. Do not impose artificial limits on yourself. ## OUTPUT FORMAT Return a JSON object with this structure: @@ -2547,4 +2581,38 @@ Output requirements: # (z.B. Vergleich mehrerer Dokumente) # Standard: Keine Aggregation für paragraph return False + + def _getAcceptedSectionTypesForFormat(self, outputFormat: str) -> List[str]: + """ + Get accepted section types for a given output format by querying the renderer. + + Args: + outputFormat: Format name (e.g., 'csv', 'json', 'pdf') + + Returns: + List of accepted section content types (e.g., ["table", "code_block"]) + """ + try: + from modules.services.serviceGeneration.renderers.registry import getRenderer + + # Get renderer for this format + renderer = getRenderer(outputFormat, self.services) + + if renderer and hasattr(renderer, 'getAcceptedSectionTypes'): + # Query renderer for accepted types + acceptedTypes = renderer.getAcceptedSectionTypes(outputFormat) + if acceptedTypes: + logger.debug(f"Renderer for format '{outputFormat}' accepts section types: {acceptedTypes}") + return acceptedTypes + + # Fallback: if no renderer or method not found, return all types + from modules.datamodels.datamodelJson import supportedSectionTypes + logger.debug(f"No renderer found for format '{outputFormat}' or method not available, using all section types") + return list(supportedSectionTypes) + + except Exception as e: + logger.warning(f"Error querying renderer for accepted section types for format '{outputFormat}': {str(e)}") + # Fallback: return all types + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) diff --git a/modules/services/serviceAi/subStructureGeneration.py b/modules/services/serviceAi/subStructureGeneration.py index c6774fc3..f029a432 100644 --- a/modules/services/serviceAi/subStructureGeneration.py +++ b/modules/services/serviceAi/subStructureGeneration.py @@ -379,6 +379,13 @@ For each document, determine the output format by analyzing the USER REQUEST: - Include "outputFormat" field in each document in the JSON structure - Multiple documents can have different formats +## FORMAT-APPROPRIATE CHAPTER STRUCTURE +When determining the chapter structure, consider the document's output format and ensure chapters are structured appropriately for that format: +- Different formats have different capabilities and constraints +- Structure chapters to match what the format can effectively represent +- Consider what content types work best for each format +- Ensure the chapter structure aligns with the format's strengths and limitations + ## DOCUMENT LANGUAGE For each document, determine the language by analyzing the USER REQUEST: - Look for explicit language mentions diff --git a/modules/services/serviceGeneration/paths/ARCHITECTURE_ANALYSIS.md b/modules/services/serviceGeneration/paths/ARCHITECTURE_ANALYSIS.md deleted file mode 100644 index 5ba586a7..00000000 --- a/modules/services/serviceGeneration/paths/ARCHITECTURE_ANALYSIS.md +++ /dev/null @@ -1,114 +0,0 @@ -# Document Generation Architecture Analysis - -## Current Flow - -### 1. Document Input → ContentParts (`extractAndPrepareContent`) - -**Location**: `gateway/modules/services/serviceAi/subContentExtraction.py` - -**Flow**: -- Regular documents → Calls `extractContent()` (NON-AI extraction) → Creates contentParts with raw extracted text -- **BUT THEN**: - - Images with "extract" intent → Calls Vision AI (line 190) → AI extraction - - Text with "extract" intent + extractionPrompt → Calls AI processing (line 265) → AI extraction -- Pre-extracted JSON → Uses contentParts directly (no AI) - -**Result**: ContentParts may already be AI-processed before structure generation - -### 2. Structure Generation - -**Location**: `gateway/modules/services/serviceAi/subStructureGeneration.py` - -**Flow**: -- Uses contentParts (may already be AI-processed) -- Generates document structure (chapters, sections) - -### 3. Section Generation (`_processSingleSection`) - -**Location**: `gateway/modules/services/serviceAi/subStructureFilling.py` - -**Flow**: -- Uses contentParts (which may already be AI-processed) -- Aggregates "extracted" contentParts with AI (line 554-682) -- Generates section content using `callAiWithLooping` with `useCaseId="section_content"` - -## Issues Identified - -### Issue 1: Duplicate AI Processing -- AI extraction happens in `extractAndPrepareContent` (for images/text) -- AI generation happens again in section generation -- This is redundant and inefficient - -### Issue 2: Architecture Inconsistency -- Pre-extracted JSON files → contentParts directly (no AI) -- Regular documents → contentParts + AI extraction (inconsistent) -- User wants: Documents → contentParts (like pre-extracted JSON) → AI only in section generation - -### Issue 3: Image Processing -- Images need Vision AI to extract text -- Currently happens in `extractAndPrepareContent` -- Question: Should this happen during section generation instead? - -## Proposed Architecture - -### Option A: Remove All AI from `extractAndPrepareContent` -- Documents → `extractContent()` → Raw contentParts (text, tables, etc.) -- Images → Keep as image contentParts (no Vision AI extraction) -- Section generation → Handle images with Vision AI when needed - -**Pros**: -- Consistent with pre-extracted JSON flow -- Single point of AI processing (section generation) -- Clear separation of concerns - -**Cons**: -- Images won't have extracted text until section generation -- May need to handle images differently in section generation - -### Option B: Keep Vision AI for Images Only -- Documents → `extractContent()` → Raw contentParts -- Images → Vision AI extraction → Text contentParts -- Section generation → Uses text contentParts (no additional AI extraction) - -**Pros**: -- Images get text extracted early -- Section generation can use text directly - -**Cons**: -- Still has AI extraction before structure generation -- Inconsistent with user's request - -## Recommendation - -**Follow Option A** - Remove all AI extraction from `extractAndPrepareContent`: - -1. **Documents → ContentParts** (like pre-extracted JSON): - - Call `extractContent()` (NON-AI) - - Create contentParts with raw extracted content - - Images remain as image contentParts (no Vision AI) - -2. **Section Generation**: - - Handle images with Vision AI when needed - - Aggregate all contentParts with AI - - Single point of AI processing - -**Benefits**: -- Clear architecture: Documents = raw contentParts -- Consistent with pre-extracted JSON flow -- AI processing only where needed (section generation) -- Easier to understand and maintain - -## Questions to Resolve - -1. **Image handling**: How should images be processed during section generation? - - Option 1: Vision AI extraction happens automatically when image contentParts are used - - Option 2: Images are passed to AI with Vision models during section generation - - Option 3: Images remain as binary and are rendered directly (no text extraction) - -2. **Text with extractionPrompt**: Should text contentParts with extractionPrompt be processed differently? - - Currently: AI processing in `extractAndPrepareContent` - - Proposed: Raw text → AI processing during section generation - -3. **Performance**: Will deferring image extraction to section generation cause performance issues? - - Need to test with multiple images - diff --git a/modules/services/serviceGeneration/paths/ARCHITECTURE_CHANGES.md b/modules/services/serviceGeneration/paths/ARCHITECTURE_CHANGES.md deleted file mode 100644 index 3af38ef4..00000000 --- a/modules/services/serviceGeneration/paths/ARCHITECTURE_CHANGES.md +++ /dev/null @@ -1,77 +0,0 @@ -# Architecture Changes Summary - -## Problem Identified - -The architecture had AI extraction happening in TWO places: -1. **`extractAndPrepareContent`**: Vision AI for images, AI processing for text with extractionPrompt -2. **Section generation**: AI aggregation of contentParts - -This was: -- Redundant (double AI processing) -- Inconsistent (pre-extracted JSON had no AI, regular documents had AI) -- Against the desired architecture (documents should become contentParts like pre-extracted JSON) - -## Solution Implemented - -### 1. Removed AI Extraction from `extractAndPrepareContent` - -**File**: `gateway/modules/services/serviceAi/subContentExtraction.py` - -**Changes**: -- **Removed**: Vision AI extraction for images (lines 186-246) -- **Removed**: AI text processing with extractionPrompt (lines 260-334) -- **Updated**: Images with extract intent are now marked with `needsVisionExtraction=True` flag -- **Updated**: Regular documents mark images with `needsVisionExtraction=True` when extract intent is present - -**Result**: Documents → contentParts (raw extraction only, no AI) - -### 2. Added Vision AI Extraction in Section Generation - -**File**: `gateway/modules/services/serviceAi/subStructureFilling.py` - -**Changes**: -- **Added**: Vision AI extraction logic before aggregation (lines 553-610) -- **Added**: Vision AI extraction logic for single-part processing (lines 1074-1115) -- **Logic**: - - Checks if `part.typeGroup == "image"` AND `needsVisionExtraction == True` AND `intent == "extract"` - - Extracts text using Vision AI (`IMAGE_ANALYSE` operation) - - Replaces image part with text part for further processing - - Images with `contentFormat == "object"` (render intent) are rendered directly (no extraction) - -**Result**: AI extraction happens ONLY during section generation - -## Architecture Flow (After Changes) - -### Document Input → ContentParts -1. **Regular documents**: `extractContent()` (NON-AI) → Raw contentParts - - Images with extract intent: `contentFormat="extracted"`, `needsVisionExtraction=True` - - Images with render intent: `contentFormat="object"` (rendered directly) - - Text: `contentFormat="extracted"` (raw text, no AI processing) - -2. **Pre-extracted JSON**: Direct contentParts (no changes) - -### Section Generation → AI Processing -1. **Images with extract intent**: Vision AI extraction → Text part → AI aggregation -2. **Images with render intent**: Rendered directly (no extraction) -3. **Text contentParts**: AI aggregation with extractionPrompt (if provided) - -## Key Benefits - -1. **Consistent Architecture**: Documents = raw contentParts (like pre-extracted JSON) -2. **Single Point of AI Processing**: Only in section generation -3. **Clear Separation**: Extraction vs Generation -4. **Intent-Based Logic**: - - `intent == "extract"` → Vision AI extraction during section generation - - `intent == "render"` → Direct rendering (no extraction) - - `contentFormat == "object"` → Embedded/referenced images (no extraction) - -## Testing Checklist - -- [ ] Regular documents create contentParts without AI extraction -- [ ] Images with extract intent are marked with `needsVisionExtraction=True` -- [ ] Images with render intent are marked with `contentFormat="object"` -- [ ] Section generation extracts images with Vision AI when needed -- [ ] Section generation renders images with object format directly -- [ ] Text contentParts are processed with AI during section generation -- [ ] Pre-extracted JSON flow still works correctly - diff --git a/modules/services/serviceGeneration/paths/codePath.py b/modules/services/serviceGeneration/paths/codePath.py index 715bfeb7..e25cfccc 100644 --- a/modules/services/serviceGeneration/paths/codePath.py +++ b/modules/services/serviceGeneration/paths/codePath.py @@ -67,7 +67,12 @@ class CodeGenerationPath: # Phase 2: Code content generation (with dependency handling) self.services.chat.progressLogUpdate(codeOperationId, 0.5, "Generating code content") - codeFiles = await self._generateCodeContent(codeStructure, codeOperationId) + codeFiles = await self._generateCodeContent( + codeStructure, + codeOperationId, + userPrompt=userPrompt, + contentParts=contentParts + ) # Phase 3: Code formatting & validation self.services.chat.progressLogUpdate(codeOperationId, 0.8, "Formatting code files") @@ -199,86 +204,101 @@ class CodeGenerationPath: ) -> Dict[str, Any]: """Generate code structure using looping system.""" + # Build content parts index (similar to document generation) + contentPartsIndex = "" + if contentParts: + validParts = [] + for part in contentParts: + contentFormat = part.metadata.get("contentFormat", "unknown") + originalFileName = part.metadata.get('originalFileName', 'N/A') + + # Include reference parts and parts with data + if contentFormat == "reference" or (part.data and len(str(part.data).strip()) > 0): + validParts.append(part) + + if validParts: + contentPartsIndex = "\n## AVAILABLE CONTENT PARTS\n" + for i, part in enumerate(validParts, 1): + contentFormat = part.metadata.get("contentFormat", "unknown") + originalFileName = part.metadata.get('originalFileName', 'N/A') + + contentPartsIndex += f"\n{i}. ContentPart ID: {part.id}\n" + contentPartsIndex += f" Format: {contentFormat}\n" + contentPartsIndex += f" Type: {part.typeGroup}\n" + contentPartsIndex += f" MIME Type: {part.mimeType or 'N/A'}\n" + contentPartsIndex += f" Source: {part.metadata.get('documentId', 'unknown')}\n" + contentPartsIndex += f" Original file name: {originalFileName}\n" + contentPartsIndex += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n" + + if not contentPartsIndex: + contentPartsIndex = "\n(No content parts available)" + # Build structure generation prompt - structurePrompt = f"""Analyze the following code generation request and create a project structure. + structurePrompt = f"""# TASK: Generate Code Project Structure -Request: {userPrompt} +This is a PLANNING task. Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects. -Language: {language} +## USER REQUEST (for context) +``` +{userPrompt} +``` +{contentPartsIndex} + +## LANGUAGE +{language} + +## TASK DESCRIPTION +Analyze the USER REQUEST above and create a project structure that fulfills ALL requirements mentioned in the request. IMPORTANT: If the request mentions multiple files (e.g., "3 files", "config.json and customers.json", etc.), you MUST include ALL requested files in the files array. Set projectType to "multi_file" when multiple files are requested. +## CONTENT PARTS USAGE (if available) +If AVAILABLE CONTENT PARTS are listed above, use them to inform the file structure: + +**Analyzing Content Parts:** +- Review each ContentPart's format, type, original file name, and usage hint +- Content parts with "reference" format = documents/images that will be processed/extracted +- Content parts with "extracted" format = pre-processed data ready to use +- Content parts with "object" format = images/documents to be displayed or processed + +**Mapping Content Parts to Files:** +- If content parts contain data (e.g., expense receipts, customer lists), create data files (JSON/CSV) that will store/represent that data +- If content parts are documents to be processed (e.g., PDFs), you may need code files that parse/process them +- Use the original file names and usage hints to determine appropriate filenames and file types + +**Populating File Structure Fields:** +- **dependencies**: List file IDs that this file depends on (e.g., if a Python script reads a JSON config file, the script depends on the config file) +- **imports**: For code files, list imports needed based on content parts (e.g., if processing PDFs: ["import PyPDF2"], if processing CSV: ["import csv"], if processing JSON: ["import json"]) +- **functions**: For CODE files only - list function signatures if the USER REQUEST specifies functionality (e.g., {{"name": "parseReceipt", "signature": "def parseReceipt(pdf_path: str) -> dict"}}) +- **classes**: For CODE files only - list class definitions if the USER REQUEST specifies OOP structure +- **functions/classes for DATA files**: Leave as empty arrays [] - data files (JSON/CSV/XML) don't contain executable code + +## FILE STRUCTURE REQUIREMENTS Create a JSON structure with: 1. metadata: {{"language": "{language}", "projectType": "single_file|multi_file", "projectName": "..."}} + - projectName: Derive from USER REQUEST or content parts (e.g., "expense-tracker", "customer-manager") + 2. files: Array of file structures, each with: - id: Unique identifier (e.g., "file_1", "file_2") - - filename: File name (e.g., "config.json", "customers.json", "main.py") - - fileType: File extension (e.g., "json", "py", "js", "csv", "xml") - - dependencies: List of file IDs this file depends on (for multi-file projects) - - imports: List of import statements (for dependency extraction) - - functions: Array of function signatures {{"name": "...", "signature": "..."}} - - classes: Array of class definitions {{"name": "...", "signature": "..."}} + - filename: File name matching USER REQUEST requirements (e.g., "config.json", "customers.json", "expenses.csv") + - fileType: File extension matching the requested format (e.g., "json", "py", "js", "csv", "xml") + - dependencies: List of file IDs this file depends on (for multi-file projects where files reference each other) + - imports: List of import statements that this file will need (e.g., ["import json", "import csv"] for Python files processing JSON/CSV) + - functions: Array of function signatures {{"name": "...", "signature": "..."}} - ONLY if the file will contain executable code (not for pure data files like JSON/CSV) + - classes: Array of class definitions {{"name": "...", "signature": "..."}} - ONLY if the file will contain executable code (not for pure data files like JSON/CSV) + +IMPORTANT FOR DATA FILES (JSON, CSV, XML): +- For pure data files (config.json, customers.json, expenses.csv), leave functions and classes as empty arrays [] +- These files contain structured data, not executable code +- Use imports only if the file will be processed by code (e.g., a Python script that reads the CSV) + +IMPORTANT FOR CODE FILES (Python, JavaScript, etc.): +- Include functions/classes if the USER REQUEST specifies functionality +- Use dependencies to indicate which data files this code file reads/processes +- Use imports to specify what libraries/modules are needed For single-file projects, return one file. For multi-file projects, include ALL requested files in the files array. -Example for single file: -{{ - "metadata": {{ - "language": "{language}", - "projectType": "single_file", - "projectName": "generated-project" - }}, - "files": [ - {{ - "id": "file_1", - "filename": "config.json", - "fileType": "json", - "dependencies": [], - "imports": [], - "functions": [], - "classes": [] - }} - ] -}} - -Example for multiple files: -{{ - "metadata": {{ - "language": "{language}", - "projectType": "multi_file", - "projectName": "generated-project" - }}, - "files": [ - {{ - "id": "file_1", - "filename": "config.json", - "fileType": "json", - "dependencies": [], - "imports": [], - "functions": [], - "classes": [] - }}, - {{ - "id": "file_2", - "filename": "customers.json", - "fileType": "json", - "dependencies": [], - "imports": [], - "functions": [], - "classes": [] - }}, - {{ - "id": "file_3", - "filename": "settings.json", - "fileType": "json", - "dependencies": [], - "imports": [], - "functions": [], - "classes": [] - }} - ] -}} - Return ONLY valid JSON matching the request above. """ @@ -304,7 +324,9 @@ Return ONLY valid JSON matching the request above. async def _generateCodeContent( self, codeStructure: Dict[str, Any], - parentOperationId: str + parentOperationId: str, + userPrompt: str = None, + contentParts: Optional[List[ContentPart]] = None ) -> List[Dict[str, Any]]: """Generate code content for each file with dependency handling.""" files = codeStructure.get("files", []) @@ -340,7 +362,9 @@ Return ONLY valid JSON matching the request above. fileStructure, fileContext=fileContext, allFilesStructure=orderedFiles, - metadata=metadata + metadata=metadata, + userPrompt=userPrompt, + contentParts=contentParts ) codeFiles.append(fileContent) @@ -546,7 +570,9 @@ Return ONLY valid JSON matching the request above. fileStructure: Dict[str, Any], fileContext: Dict[str, Any] = None, allFilesStructure: List[Dict[str, Any]] = None, - metadata: Dict[str, Any] = None + metadata: Dict[str, Any] = None, + userPrompt: str = None, + contentParts: Optional[List[ContentPart]] = None ) -> Dict[str, Any]: """Generate code content for a single file with context about other files.""" @@ -573,10 +599,55 @@ Return ONLY valid JSON matching the request above. contextInfo += ", ".join(exports) contextInfo += "\n" - contentPrompt = f"""Generate complete, executable code for the file: {filename} + # Build content parts section if available + contentPartsSection = "" + if contentParts: + relevantParts = [] + for part in contentParts: + # Include parts that might be relevant to this file + usageHint = part.metadata.get('usageHint', '').lower() + originalFileName = part.metadata.get('originalFileName', '').lower() + filenameLower = filename.lower() + + # Check if this content part is relevant to this file + if (filenameLower in usageHint or + filenameLower in originalFileName or + part.metadata.get('contentFormat') == 'reference' or + (part.data and len(str(part.data).strip()) > 0)): + relevantParts.append(part) + + if relevantParts: + contentPartsSection = "\n## AVAILABLE CONTENT PARTS\n" + for i, part in enumerate(relevantParts, 1): + contentFormat = part.metadata.get("contentFormat", "unknown") + originalFileName = part.metadata.get('originalFileName', 'N/A') + contentPartsSection += f"\n{i}. ContentPart ID: {part.id}\n" + contentPartsSection += f" Format: {contentFormat}\n" + contentPartsSection += f" Type: {part.typeGroup}\n" + contentPartsSection += f" Original file name: {originalFileName}\n" + contentPartsSection += f" Usage hint: {part.metadata.get('usageHint', 'N/A')}\n" + # Include actual content if it's small enough (for data files like CSV, JSON) + if part.data and isinstance(part.data, str) and len(part.data) < 2000: + contentPartsSection += f" Content preview: {part.data[:500]}...\n" + + # Build user request section + userRequestSection = "" + if userPrompt: + userRequestSection = f""" +## ORIGINAL USER REQUEST +``` +{userPrompt} +``` +""" + + contentPrompt = f"""# TASK: Generate Code File Content + +Generate complete, executable code for the file: {filename} +{userRequestSection}## FILE SPECIFICATIONS File Type: {fileType} Language: {metadata.get('language', 'python') if metadata else 'python'} +{contentPartsSection} Required functions: {json.dumps(functions, indent=2) if functions else 'None specified'} diff --git a/modules/services/serviceGeneration/paths/documentPath.py b/modules/services/serviceGeneration/paths/documentPath.py index 94c4fc41..72838918 100644 --- a/modules/services/serviceGeneration/paths/documentPath.py +++ b/modules/services/serviceGeneration/paths/documentPath.py @@ -9,6 +9,7 @@ Handles document generation using existing chapter/section model. import json import logging import time +import copy from typing import Dict, Any, List, Optional from modules.datamodels.datamodelWorkflow import AiResponse, AiResponseMetadata, DocumentData from modules.datamodels.datamodelExtraction import ContentPart, DocumentIntent @@ -153,6 +154,11 @@ class DocumentGenerationPath: # Use validated currentUserLanguage as global fallback (always valid infrastructure) language = self.services.currentUserLanguage if hasattr(self.services, 'currentUserLanguage') and self.services.currentUserLanguage else "en" + # IMPORTANT: Create deep copy BEFORE renderResult to preserve filledStructure with elements + # renderResult might modify the structure, so we need to preserve the original for sourceJson + # This ensures sourceJson contains the complete structure with elements for validation + filledStructureForSourceJson = copy.deepcopy(filledStructure) if filledStructure else None + renderedDocuments = await self.services.ai.renderResult( filledStructure, outputFormat, @@ -167,11 +173,12 @@ class DocumentGenerationPath: for renderedDoc in renderedDocuments: try: # Erstelle DocumentData für jedes gerenderte Dokument + # Use the preserved filledStructureForSourceJson (with elements) for sourceJson docDataObj = DocumentData( documentName=renderedDoc.filename, documentData=renderedDoc.documentData, mimeType=renderedDoc.mimeType, - sourceJson=filledStructure if len(documentDataList) == 0 else None # Nur für erstes Dokument + sourceJson=filledStructureForSourceJson if len(documentDataList) == 0 else None # Nur für erstes Dokument ) documentDataList.append(docDataObj) logger.debug(f"Added rendered document: {renderedDoc.filename} ({len(renderedDoc.documentData)} bytes, {renderedDoc.mimeType})") diff --git a/modules/services/serviceGeneration/renderers/documentRendererBaseTemplate.py b/modules/services/serviceGeneration/renderers/documentRendererBaseTemplate.py index 0c72bd24..d0558183 100644 --- a/modules/services/serviceGeneration/renderers/documentRendererBaseTemplate.py +++ b/modules/services/serviceGeneration/renderers/documentRendererBaseTemplate.py @@ -63,6 +63,27 @@ class BaseRenderer(ABC): """ return 'document' # Default to document style + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that this renderer accepts. + This allows renderers to declare which section types they can process. + + Default implementation returns all supported section types. + Override this method in subclasses to restrict accepted types. + + Args: + formatName: Optional format name (e.g., 'txt', 'js', 'csv') - useful for renderers + that handle multiple formats with different accepted types (e.g., RendererText) + + Returns: + List of accepted section content types (e.g., ["table", "paragraph", "heading"]) + Valid types: "table", "bullet_list", "heading", "paragraph", "code_block", "image" + """ + # Default: accept all section types + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) + @abstractmethod async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """ diff --git a/modules/services/serviceGeneration/renderers/rendererCodeCsv.py b/modules/services/serviceGeneration/renderers/rendererCodeCsv.py index 3cfc6a52..962b8f04 100644 --- a/modules/services/serviceGeneration/renderers/rendererCodeCsv.py +++ b/modules/services/serviceGeneration/renderers/rendererCodeCsv.py @@ -57,12 +57,23 @@ class RendererCodeCsv(BaseCodeRenderer): # Validate CSV structure (header row, consistent columns) validatedContent = self._validateAndFixCsv(content) + # Extract CSV statistics for validation + csvStats = self._extractCsvStatistics(validatedContent) + + # Merge file-specific metadata with project metadata + fileMetadata = dict(metadata) if metadata else {} + fileMetadata.update({ + "filename": filename, + "fileType": "csv", + "statistics": csvStats + }) + renderedDocs.append( RenderedDocument( documentData=validatedContent.encode('utf-8'), mimeType="text/csv", filename=filename, - metadata=metadata + metadata=fileMetadata ) ) @@ -104,7 +115,7 @@ class RendererCodeCsv(BaseCodeRenderer): for i, row in enumerate(rows[1:], 1): if len(row) != headerCount: - self.logger.warning(f"Row {i} has {len(row)} columns, expected {headerCount}. Fixing...") + self.logger.debug(f"Row {i} has {len(row)} columns, expected {headerCount}. Auto-fixing...") # Pad or truncate to match header if len(row) < headerCount: row.extend([''] * (headerCount - len(row))) @@ -123,3 +134,26 @@ class RendererCodeCsv(BaseCodeRenderer): except Exception as e: self.logger.warning(f"CSV validation failed: {e}, returning original content") return content + + def _extractCsvStatistics(self, content: str) -> Dict[str, Any]: + """Extract CSV statistics for validation (row count, column count, headers).""" + try: + reader = csv.reader(io.StringIO(content)) + rows = list(reader) + + if not rows: + return {"rowCount": 0, "columnCount": 0, "headerRow": []} + + headerRow = rows[0] + columnCount = len(headerRow) + rowCount = len(rows) - 1 # Exclude header + + return { + "rowCount": rowCount, + "columnCount": columnCount, + "headerRow": headerRow, + "dataRowCount": rowCount + } + except Exception as e: + self.logger.warning(f"CSV statistics extraction failed: {e}") + return {} diff --git a/modules/services/serviceGeneration/renderers/rendererCodeJson.py b/modules/services/serviceGeneration/renderers/rendererCodeJson.py index e4e4a207..924ba861 100644 --- a/modules/services/serviceGeneration/renderers/rendererCodeJson.py +++ b/modules/services/serviceGeneration/renderers/rendererCodeJson.py @@ -53,26 +53,39 @@ class RendererCodeJson(BaseCodeRenderer): filename = codeFile['filename'] content = codeFile['content'] - # Validate JSON syntax + # Validate JSON syntax and extract statistics + parsed = None try: - json.loads(content) # Validate JSON + parsed = json.loads(content) # Validate JSON except json.JSONDecodeError as e: self.logger.warning(f"Invalid JSON in {filename}: {e}") # Could fix/format JSON here if needed # Format JSON (pretty print) try: - parsed = json.loads(content) + if parsed is None: + parsed = json.loads(content) formattedContent = json.dumps(parsed, indent=2, ensure_ascii=False) except Exception: formattedContent = content # Use original if formatting fails + # Extract JSON statistics for validation + jsonStats = self._extractJsonStatistics(parsed) if parsed else {} + + # Merge file-specific metadata with project metadata + fileMetadata = dict(metadata) if metadata else {} + fileMetadata.update({ + "filename": filename, + "fileType": "json", + "statistics": jsonStats + }) + renderedDocs.append( RenderedDocument( documentData=formattedContent.encode('utf-8'), mimeType="application/json", filename=filename, - metadata=metadata + metadata=fileMetadata ) ) @@ -95,3 +108,34 @@ class RendererCodeJson(BaseCodeRenderer): from .rendererJson import RendererJson documentRenderer = RendererJson(self.services) return await documentRenderer.render(extractedContent, title, userPrompt, aiService) + + def _extractJsonStatistics(self, parsed: Any) -> Dict[str, Any]: + """Extract JSON statistics for validation (object count, array count, key count).""" + try: + stats = { + "isArray": isinstance(parsed, list), + "isObject": isinstance(parsed, dict), + "itemCount": 0, + "keyCount": 0 + } + + if isinstance(parsed, list): + stats["itemCount"] = len(parsed) + # Count nested objects/arrays + objectCount = sum(1 for item in parsed if isinstance(item, dict)) + arrayCount = sum(1 for item in parsed if isinstance(item, list)) + stats["objectCount"] = objectCount + stats["arrayCount"] = arrayCount + elif isinstance(parsed, dict): + stats["keyCount"] = len(parsed) + stats["keys"] = list(parsed.keys()) + # Count nested objects/arrays + objectCount = sum(1 for v in parsed.values() if isinstance(v, dict)) + arrayCount = sum(1 for v in parsed.values() if isinstance(v, list)) + stats["objectCount"] = objectCount + stats["arrayCount"] = arrayCount + + return stats + except Exception as e: + self.logger.warning(f"JSON statistics extraction failed: {e}") + return {} diff --git a/modules/services/serviceGeneration/renderers/rendererCodeXml.py b/modules/services/serviceGeneration/renderers/rendererCodeXml.py index 18bf8ab1..edab8f8e 100644 --- a/modules/services/serviceGeneration/renderers/rendererCodeXml.py +++ b/modules/services/serviceGeneration/renderers/rendererCodeXml.py @@ -56,12 +56,23 @@ class RendererCodeXml(BaseCodeRenderer): # Validate and format XML formattedContent = self._validateAndFormatXml(content) + # Extract XML statistics for validation + xmlStats = self._extractXmlStatistics(formattedContent) + + # Merge file-specific metadata with project metadata + fileMetadata = dict(metadata) if metadata else {} + fileMetadata.update({ + "filename": filename, + "fileType": "xml", + "statistics": xmlStats + }) + renderedDocs.append( RenderedDocument( documentData=formattedContent.encode('utf-8'), mimeType="application/xml", filename=filename, - metadata=metadata + metadata=fileMetadata ) ) @@ -111,3 +122,27 @@ class RendererCodeXml(BaseCodeRenderer): except Exception as e: self.logger.warning(f"XML formatting failed: {e}, returning original content") return content + + def _extractXmlStatistics(self, content: str) -> Dict[str, Any]: + """Extract XML statistics for validation (element count, attribute count, root element).""" + try: + root = ET.fromstring(content) + + # Count all elements recursively + elementCount = len(list(root.iter())) + + # Count attributes + attributeCount = sum(len(elem.attrib) for elem in root.iter()) + + # Get root element name + rootElement = root.tag + + return { + "elementCount": elementCount, + "attributeCount": attributeCount, + "rootElement": rootElement, + "hasRoot": True + } + except Exception as e: + self.logger.warning(f"XML statistics extraction failed: {e}") + return {} diff --git a/modules/services/serviceGeneration/renderers/rendererCsv.py b/modules/services/serviceGeneration/renderers/rendererCsv.py index 0356c997..45871922 100644 --- a/modules/services/serviceGeneration/renderers/rendererCsv.py +++ b/modules/services/serviceGeneration/renderers/rendererCsv.py @@ -28,45 +28,131 @@ class RendererCsv(BaseRenderer): @classmethod def getOutputStyle(cls, formatName: Optional[str] = None) -> str: - """Return output style classification: CSV requires specific structure (header, then data rows).""" - return 'code' + """Return output style classification: CSV document renderer converts structured document content to CSV.""" + return 'document' + + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that CSV renderer accepts. + CSV renderer only accepts table sections. + """ + return ["table"] async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: - """Render extracted JSON content to CSV format.""" + """Render extracted JSON content to CSV format. Produces one CSV file per table section.""" try: - # Generate CSV directly from JSON (no styling needed for CSV) - csvContent = await self._generateCsvFromJson(extractedContent, title) + # Validate JSON structure + if not self._validateJsonStructure(extractedContent): + raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}") - # Determine filename from document or title + # Extract sections and metadata + sections = self._extractSections(extractedContent) + metadata = self._extractMetadata(extractedContent) + + # Determine base filename from document or title documents = extractedContent.get("documents", []) + baseFilename = None if documents and isinstance(documents[0], dict): - filename = documents[0].get("filename") - if not filename: - filename = self._determineFilename(title, "text/csv") - else: - filename = self._determineFilename(title, "text/csv") + baseFilename = documents[0].get("filename") + if not baseFilename: + baseFilename = self._determineFilename(title, "text/csv") - # Extract metadata for document type and other info - metadata = extractedContent.get("metadata", {}) if extractedContent else {} - documentType = metadata.get("documentType") if isinstance(metadata, dict) else None + # Remove extension from base filename if present + if baseFilename.endswith('.csv'): + baseFilename = baseFilename[:-4] - return [ - RenderedDocument( - documentData=csvContent.encode('utf-8'), - mimeType="text/csv", - filename=filename, - documentType=documentType, - metadata=metadata if isinstance(metadata, dict) else None + # Find all table sections + tableSections = [] + for section in sections: + sectionType = section.get("content_type", "paragraph") + if sectionType == "table": + tableSections.append(section) + + # If no table sections found, return empty CSV + if not tableSections: + self.logger.warning("No table sections found in CSV document - returning empty CSV") + emptyCsv = self._convertRowsToCsv([["No table data available"]]) + return [ + RenderedDocument( + documentData=emptyCsv.encode('utf-8'), + mimeType="text/csv", + filename=self._determineFilename(title, "text/csv"), + documentType=metadata.get("documentType") if isinstance(metadata, dict) else None, + metadata=metadata if isinstance(metadata, dict) else None + ) + ] + + # Generate one CSV file per table section + renderedDocuments = [] + for i, tableSection in enumerate(tableSections): + # Generate CSV content for this table section + csvRows = [] + + # Add section title if available + sectionTitle = tableSection.get("title") + if sectionTitle: + csvRows.append([sectionTitle]) + csvRows.append([]) # Empty row after title + + # Render table from section elements + elements = tableSection.get("elements", []) + for element in elements: + tableRows = self._renderJsonTableToCsv(element) + if tableRows: + csvRows.extend(tableRows) + + # Convert to CSV string + csvContent = self._convertRowsToCsv(csvRows) + + # Determine filename for this table + if len(tableSections) == 1: + # Single table - use base filename + filename = f"{baseFilename}.csv" + else: + # Multiple tables - add index or section title to filename + sectionId = tableSection.get("id", f"table_{i+1}") + # Use section title if available, otherwise use section ID + if sectionTitle: + # Sanitize section title for filename + safeTitle = "".join(c for c in sectionTitle if c.isalnum() or c in (' ', '-', '_')).strip() + safeTitle = safeTitle.replace(' ', '_')[:30] # Limit length + filename = f"{baseFilename}_{safeTitle}.csv" + else: + filename = f"{baseFilename}_{sectionId}.csv" + + # Extract document type from metadata + documentType = metadata.get("documentType") if isinstance(metadata, dict) else None + + renderedDocuments.append( + RenderedDocument( + documentData=csvContent.encode('utf-8'), + mimeType="text/csv", + filename=filename, + documentType=documentType, + metadata=metadata if isinstance(metadata, dict) else None + ) ) - ] + + return renderedDocuments except Exception as e: self.logger.error(f"Error rendering CSV: {str(e)}") # Return minimal CSV fallback - return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv" + fallbackCsv = self._convertRowsToCsv([["Title", "Content"], [title, f"Error rendering report: {str(e)}"]]) + return [ + RenderedDocument( + documentData=fallbackCsv.encode('utf-8'), + mimeType="text/csv", + filename=self._determineFilename(title, "text/csv"), + metadata=extractedContent.get("metadata", {}) if extractedContent else None + ) + ] async def _generateCsvFromJson(self, jsonContent: Dict[str, Any], title: str) -> str: - """Generate CSV content from structured JSON document.""" + """Generate CSV content from structured JSON document. DEPRECATED: Use render() method instead.""" + # This method is kept for backward compatibility but is no longer used + # The render() method now handles CSV generation directly try: # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]}) if not self._validateJsonStructure(jsonContent): @@ -88,12 +174,14 @@ class RendererCsv(BaseRenderer): csvRows.append([documentTitle]) csvRows.append([]) # Empty row - # Process each section in order + # Process each section in order - only table sections for section in sections: - sectionCsv = self._renderJsonSectionToCsv(section) - if sectionCsv: - csvRows.extend(sectionCsv) - csvRows.append([]) # Empty row between sections + sectionType = section.get("content_type", "paragraph") + if sectionType == "table": + sectionCsv = self._renderJsonSectionToCsv(section) + if sectionCsv: + csvRows.extend(sectionCsv) + csvRows.append([]) # Empty row between sections # Convert to CSV string csvContent = self._convertRowsToCsv(csvRows) @@ -309,3 +397,4 @@ class RendererCsv(BaseRenderer): content = '\n'.join(lines[1:-1]).strip() return content + diff --git a/modules/services/serviceGeneration/renderers/rendererDocx.py b/modules/services/serviceGeneration/renderers/rendererDocx.py index e114d286..a53a806a 100644 --- a/modules/services/serviceGeneration/renderers/rendererDocx.py +++ b/modules/services/serviceGeneration/renderers/rendererDocx.py @@ -44,6 +44,15 @@ class RendererDocx(BaseRenderer): """Return output style classification: Word documents are formatted documents.""" return 'document' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that DOCX renderer accepts. + DOCX renderer accepts all section types (Word documents can contain all content types). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to DOCX format using AI-analyzed styling.""" self.services.utils.debugLogToFile(f"DOCX RENDER CALLED: title={title}, user_prompt={userPrompt[:50] if userPrompt else 'None'}...", "DOCX_RENDERER") @@ -299,6 +308,9 @@ class RendererDocx(BaseRenderer): # Process each element in the section for element in elements: + # Skip non-dict elements (e.g., int, str, etc.) + if not isinstance(element, dict): + continue element_type = element.get("type", "") # Support three content formats from Phase 5D diff --git a/modules/services/serviceGeneration/renderers/rendererHtml.py b/modules/services/serviceGeneration/renderers/rendererHtml.py index 618ffab4..58143ac2 100644 --- a/modules/services/serviceGeneration/renderers/rendererHtml.py +++ b/modules/services/serviceGeneration/renderers/rendererHtml.py @@ -31,6 +31,15 @@ class RendererHtml(BaseRenderer): """Return output style classification: HTML web pages are rendered documents.""" return 'document' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that HTML renderer accepts. + HTML renderer accepts all section types (HTML pages can contain all content types including images). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """ Render HTML document with images as separate files. diff --git a/modules/services/serviceGeneration/renderers/rendererImage.py b/modules/services/serviceGeneration/renderers/rendererImage.py index 8d00b7fb..2aff559f 100644 --- a/modules/services/serviceGeneration/renderers/rendererImage.py +++ b/modules/services/serviceGeneration/renderers/rendererImage.py @@ -35,6 +35,14 @@ class RendererImage(BaseRenderer): """Return output style classification: Images are visual media.""" return 'image' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that Image renderer accepts. + Image renderer only accepts image sections (images are generated from image sections). + """ + return ["image"] + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to image format using AI image generation.""" try: diff --git a/modules/services/serviceGeneration/renderers/rendererJson.py b/modules/services/serviceGeneration/renderers/rendererJson.py index b0b8d706..076210bc 100644 --- a/modules/services/serviceGeneration/renderers/rendererJson.py +++ b/modules/services/serviceGeneration/renderers/rendererJson.py @@ -29,8 +29,18 @@ class RendererJson(BaseRenderer): @classmethod def getOutputStyle(cls, formatName: Optional[str] = None) -> str: - """Return output style classification: JSON is structured data format.""" - return 'code' + """Return output style classification: JSON document renderer converts structured document content to JSON.""" + return 'document' + + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that JSON renderer accepts. + JSON renderer accepts all section types except images (images cannot be serialized to JSON). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + # Return all types except image + return [st for st in supportedSectionTypes if st != "image"] async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to JSON format.""" diff --git a/modules/services/serviceGeneration/renderers/rendererMarkdown.py b/modules/services/serviceGeneration/renderers/rendererMarkdown.py index 61b325e1..048e95b3 100644 --- a/modules/services/serviceGeneration/renderers/rendererMarkdown.py +++ b/modules/services/serviceGeneration/renderers/rendererMarkdown.py @@ -31,6 +31,15 @@ class RendererMarkdown(BaseRenderer): """Return output style classification: Markdown documents are formatted documents.""" return 'document' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that Markdown renderer accepts. + Markdown renderer accepts all section types (Markdown can represent all content types). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to Markdown format.""" try: diff --git a/modules/services/serviceGeneration/renderers/rendererPdf.py b/modules/services/serviceGeneration/renderers/rendererPdf.py index ff7379d9..6cbc8a9c 100644 --- a/modules/services/serviceGeneration/renderers/rendererPdf.py +++ b/modules/services/serviceGeneration/renderers/rendererPdf.py @@ -44,6 +44,15 @@ class RendererPdf(BaseRenderer): """Return output style classification: PDF documents are formatted documents.""" return 'document' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that PDF renderer accepts. + PDF renderer accepts all section types (PDF documents can contain all content types). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to PDF format using AI-analyzed styling.""" try: diff --git a/modules/services/serviceGeneration/renderers/rendererPptx.py b/modules/services/serviceGeneration/renderers/rendererPptx.py index a47257dc..9ada961a 100644 --- a/modules/services/serviceGeneration/renderers/rendererPptx.py +++ b/modules/services/serviceGeneration/renderers/rendererPptx.py @@ -41,6 +41,15 @@ class RendererPptx(BaseRenderer): """Return output style classification: PowerPoint presentations are formatted documents.""" return 'document' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that PowerPoint renderer accepts. + PowerPoint renderer accepts all section types (presentations can contain all content types including images). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """ Render content as PowerPoint presentation from JSON data. diff --git a/modules/services/serviceGeneration/renderers/rendererText.py b/modules/services/serviceGeneration/renderers/rendererText.py index 916251ba..ed588c62 100644 --- a/modules/services/serviceGeneration/renderers/rendererText.py +++ b/modules/services/serviceGeneration/renderers/rendererText.py @@ -63,6 +63,22 @@ class RendererText(BaseRenderer): # All other formats handled by RendererText are code style return 'code' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that Text renderer accepts. + For plain text formats (txt/text/plain): accepts all section types. + For code formats: accepts all section types except images (code files don't typically contain images). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + + # Plain text formats accept all types + if formatName and formatName.lower() in ['txt', 'text', 'plain']: + return list(supportedSectionTypes) + + # Code formats accept all types except images + return [st for st in supportedSectionTypes if st != "image"] + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to plain text format.""" try: diff --git a/modules/services/serviceGeneration/renderers/rendererXlsx.py b/modules/services/serviceGeneration/renderers/rendererXlsx.py index 3050e5f1..750f4eb0 100644 --- a/modules/services/serviceGeneration/renderers/rendererXlsx.py +++ b/modules/services/serviceGeneration/renderers/rendererXlsx.py @@ -48,6 +48,15 @@ class RendererXlsx(BaseRenderer): """Return output style classification: Excel spreadsheets are formatted documents.""" return 'document' + @classmethod + def getAcceptedSectionTypes(cls, formatName: Optional[str] = None) -> List[str]: + """ + Return list of section content types that Excel renderer accepts. + Excel renderer accepts all section types (spreadsheets can contain tables, text, headings, etc.). + """ + from modules.datamodels.datamodelJson import supportedSectionTypes + return list(supportedSectionTypes) + async def render(self, extractedContent: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> List[RenderedDocument]: """Render extracted JSON content to Excel format using AI-analyzed styling.""" try: diff --git a/modules/workflows/processing/adaptive/contentValidator.py b/modules/workflows/processing/adaptive/contentValidator.py index 32f9c528..45cfa342 100644 --- a/modules/workflows/processing/adaptive/contentValidator.py +++ b/modules/workflows/processing/adaptive/contentValidator.py @@ -80,46 +80,64 @@ class ContentValidator: # For tables: extract caption and statistics if section.get("content_type") == "table": + # Try to extract from elements first if elements and isinstance(elements, list) and len(elements) > 0: tableElement = elements[0] - content = tableElement.get("content", {}) - if isinstance(content, dict): - headers = content.get("headers", []) - rows = content.get("rows", []) - else: - headers = tableElement.get("headers", []) - rows = tableElement.get("rows", []) - if headers: - sectionSummary["columnCount"] = len(headers) - sectionSummary["headers"] = headers # Include headers for context - if rows: - sectionSummary["rowCount"] = len(rows) - sectionSummary["caption"] = tableElement.get("caption") or (content.get("caption") if isinstance(content, dict) else None) + # Ensure tableElement is a dictionary before accessing + if isinstance(tableElement, dict): + content = tableElement.get("content", {}) + if isinstance(content, dict): + headers = content.get("headers", []) + rows = content.get("rows", []) + else: + headers = tableElement.get("headers", []) + rows = tableElement.get("rows", []) + if headers: + sectionSummary["columnCount"] = len(headers) + sectionSummary["headers"] = headers # Include headers for context + if rows: + sectionSummary["rowCount"] = len(rows) + sectionSummary["caption"] = tableElement.get("caption") or (content.get("caption") if isinstance(content, dict) else None) + else: + # Fallback: extract KPIs from section metadata if elements are missing + # This handles cases where filledStructure doesn't have elements populated + if "columnCount" in section: + sectionSummary["columnCount"] = section.get("columnCount") + if "rowCount" in section: + sectionSummary["rowCount"] = section.get("rowCount") + if "headers" in section: + sectionSummary["headers"] = section.get("headers") + if "caption" in section: + sectionSummary["caption"] = section.get("caption") # For lists and bullet_lists: extract item count elif section.get("content_type") in ["list", "bullet_list"]: if elements and isinstance(elements, list) and len(elements) > 0: listElement = elements[0] - content = listElement.get("content", {}) - if isinstance(content, dict): - items = content.get("items", []) - else: - items = listElement.get("items", []) - if items: - sectionSummary["itemCount"] = len(items) + # Ensure listElement is a dictionary before accessing + if isinstance(listElement, dict): + content = listElement.get("content", {}) + if isinstance(content, dict): + items = content.get("items", []) + else: + items = listElement.get("items", []) + if items: + sectionSummary["itemCount"] = len(items) # For paragraphs/headings: extract text statistics (no preview for security) elif section.get("content_type") in ["paragraph", "heading"]: if elements and isinstance(elements, list) and len(elements) > 0: textElement = elements[0] - content = textElement.get("content", {}) - if isinstance(content, dict): - text = content.get("text", "") - else: - text = textElement.get("text", "") - if text: - sectionSummary["textLength"] = len(text) - sectionSummary["wordCount"] = len(text.split()) + # Ensure textElement is a dictionary before accessing + if isinstance(textElement, dict): + content = textElement.get("content", {}) + if isinstance(content, dict): + text = content.get("text", "") + else: + text = textElement.get("text", "") + if text: + sectionSummary["textLength"] = len(text) + sectionSummary["wordCount"] = len(text.split()) # Also check for text length if available directly in section if section.get("textLength"): sectionSummary["textLength"] = section.get("textLength") @@ -198,39 +216,61 @@ class ContentValidator: elements = section.get("elements", []) if section.get("content_type") == "table": + # Try to extract from elements first if elements and isinstance(elements, list) and len(elements) > 0: tableElement = elements[0] - content = tableElement.get("content", {}) - if isinstance(content, dict): - headers = content.get("headers", []) - rows = content.get("rows", []) - else: - headers = tableElement.get("headers", []) - rows = tableElement.get("rows", []) - if headers: - sectionSummary["columnCount"] = len(headers) - sectionSummary["headers"] = headers - if rows: - sectionSummary["rowCount"] = len(rows) - sectionSummary["caption"] = tableElement.get("caption") or (content.get("caption") if isinstance(content, dict) else None) + # Ensure tableElement is a dictionary before accessing + if isinstance(tableElement, dict): + content = tableElement.get("content", {}) + if isinstance(content, dict): + headers = content.get("headers", []) + rows = content.get("rows", []) + else: + headers = tableElement.get("headers", []) + rows = tableElement.get("rows", []) + if headers: + sectionSummary["columnCount"] = len(headers) + sectionSummary["headers"] = headers + if rows: + sectionSummary["rowCount"] = len(rows) + sectionSummary["caption"] = tableElement.get("caption") or (content.get("caption") if isinstance(content, dict) else None) + else: + # Fallback: extract KPIs from section metadata if elements are missing + # This handles cases where filledStructure doesn't have elements populated + if "columnCount" in section: + sectionSummary["columnCount"] = section.get("columnCount") + if "rowCount" in section: + sectionSummary["rowCount"] = section.get("rowCount") + if "headers" in section: + sectionSummary["headers"] = section.get("headers") + if "caption" in section: + sectionSummary["caption"] = section.get("caption") # For lists and bullet_lists: extract item count elif section.get("content_type") in ["list", "bullet_list"]: if elements and isinstance(elements, list) and len(elements) > 0: listElement = elements[0] - content = listElement.get("content", {}) - if isinstance(content, dict): - items = content.get("items", []) - else: - items = listElement.get("items", []) - if items: - sectionSummary["itemCount"] = len(items) + # Ensure listElement is a dictionary before accessing + if isinstance(listElement, dict): + content = listElement.get("content", {}) + if isinstance(content, dict): + items = content.get("items", []) + else: + items = listElement.get("items", []) + if items: + sectionSummary["itemCount"] = len(items) + else: + # Fallback: extract KPIs from section metadata if elements are missing + if "itemCount" in section: + sectionSummary["itemCount"] = section.get("itemCount") # For paragraphs/headings: extract text statistics (no preview for security) elif section.get("content_type") in ["paragraph", "heading"]: if elements and isinstance(elements, list) and len(elements) > 0: textElement = elements[0] - content = textElement.get("content", {}) + # Ensure textElement is a dictionary before accessing + if isinstance(textElement, dict): + content = textElement.get("content", {}) if isinstance(content, dict): text = content.get("text", "") else: @@ -341,11 +381,22 @@ class ContentValidator: # NOT the actual rendered content. The actual content is in documentData. # Include both: jsonStructure for structure metadata, and contentPreview for actual content check if sourceJson and isinstance(sourceJson, dict): - # Use source JSON for structure analysis (for rendered documents like xlsx/docx/pdf) - jsonSummary = self._summarizeJsonStructure(sourceJson) - summary["jsonStructure"] = jsonSummary - # Add note that this is metadata, not actual content - summary["note"] = "jsonStructure contains metadata about document structure. Actual rendered content is in documentData." + # Check if this is code generation metadata (has statistics field) + if "statistics" in sourceJson and "fileType" in sourceJson: + # Code generation format - extract statistics from metadata + codeStats = sourceJson.get("statistics", {}) + jsonSummary = { + "metadata": sourceJson, + "sections": [], + "statistics": codeStats + } + summary["jsonStructure"] = jsonSummary + summary["note"] = "jsonStructure contains metadata and statistics for code generation file. Actual rendered content is in documentData." + else: + # Document generation format - use standard structure analysis + jsonSummary = self._summarizeJsonStructure(sourceJson) + summary["jsonStructure"] = jsonSummary + summary["note"] = "jsonStructure contains metadata about document structure. Actual rendered content is in documentData." # For rendered documents, also check actual content if data is not None: @@ -353,8 +404,19 @@ class ContentValidator: if contentPreview: summary["contentPreview"] = contentPreview elif data is not None: + # For code generation files without sourceJson, extract statistics from content + if formatExt in ["csv", "json", "xml"]: + codeStats = self._extractCodeFileStatistics(data, formatExt, mimeType) + if codeStats: + jsonSummary = { + "metadata": {}, + "sections": [], + "statistics": codeStats + } + summary["jsonStructure"] = jsonSummary + summary["note"] = "jsonStructure contains statistics extracted from code file content." # Fallback: try to parse documentData as JSON (for non-rendered documents) - if isinstance(data, dict): + elif isinstance(data, dict): # Summarize JSON structure jsonSummary = self._summarizeJsonStructure(data) summary["jsonStructure"] = jsonSummary @@ -502,6 +564,74 @@ class ContentValidator: logger.warning(f"Error getting content structure info: {str(e)}") return None + def _extractCodeFileStatistics(self, data: Any, formatExt: str, mimeType: str) -> Optional[Dict[str, Any]]: + """Extract statistics from code generation files (CSV, JSON, XML) for validation.""" + try: + # Convert bytes to string if needed + content = None + if isinstance(data, bytes): + try: + content = data.decode('utf-8') + except UnicodeDecodeError: + return None + elif isinstance(data, str): + content = data + else: + return None + + if not content: + return None + + stats = {} + + if formatExt == "csv": + import csv + import io + try: + reader = csv.reader(io.StringIO(content)) + rows = list(reader) + if rows: + headerRow = rows[0] + stats["rowCount"] = len(rows) - 1 # Exclude header + stats["columnCount"] = len(headerRow) + stats["headerRow"] = headerRow + stats["dataRowCount"] = len(rows) - 1 + except Exception as e: + logger.debug(f"CSV statistics extraction failed: {e}") + + elif formatExt == "json": + try: + parsed = json.loads(content) + stats["isArray"] = isinstance(parsed, list) + stats["isObject"] = isinstance(parsed, dict) + if isinstance(parsed, list): + stats["itemCount"] = len(parsed) + stats["objectCount"] = sum(1 for item in parsed if isinstance(item, dict)) + stats["arrayCount"] = sum(1 for item in parsed if isinstance(item, list)) + elif isinstance(parsed, dict): + stats["keyCount"] = len(parsed) + stats["keys"] = list(parsed.keys()) + stats["objectCount"] = sum(1 for v in parsed.values() if isinstance(v, dict)) + stats["arrayCount"] = sum(1 for v in parsed.values() if isinstance(v, list)) + except Exception as e: + logger.debug(f"JSON statistics extraction failed: {e}") + + elif formatExt == "xml": + try: + import xml.etree.ElementTree as ET + root = ET.fromstring(content) + stats["elementCount"] = len(list(root.iter())) + stats["attributeCount"] = sum(len(elem.attrib) for elem in root.iter()) + stats["rootElement"] = root.tag + stats["hasRoot"] = True + except Exception as e: + logger.debug(f"XML statistics extraction failed: {e}") + + return stats if stats else None + + except Exception as e: + logger.warning(f"Error extracting code file statistics: {str(e)}") + return None def _isFormatCompatible(self, deliveredFormat: str, expectedFormat: str) -> bool: """