From 53819f90c32b59b74dcf469e0c05c69b37d21e56 Mon Sep 17 00:00:00 2001 From: ValueOn AG
, or style attributes -11. For headings: Return plain text only, no HTML tags or styling -12. For images: Do NOT include base64 data in JSON - images are handled separately +1. Extract all data from the context provided. Do not skip or omit any data. +2. Extract data only from the provided context. Never invent, create, or generate data that is not in the context. +3. If the context contains no data, return empty structures (empty rows array for tables). +4. Aggregate all data into one element (e.g., one table). +5. For table: Extract all rows from the context. Return {{"headers": [...], "rows": []}} only if no data exists. +6. Format based on content_type ({contentType}). +7. No HTML/styling: Plain text only, no markup. ## OUTPUT FORMAT Return a JSON object with this structure: @@ -1874,16 +1882,19 @@ Return a JSON object with this structure: ] }} -CRITICAL: -- "content" MUST always be an object (never a string) -- For text content: Return plain text only, NO HTML tags, NO CSS styles, NO formatting markup -- Return ONLY valid JSON. Do not include any explanatory text outside the JSON. +Output requirements: +- "content" must be an object (never a string) +- Return only valid JSON - no text before, no text after, no comments, no explanations +- No invented data: Return empty structures if ContentParts have no data +- Extract all data: Process every ContentPart completely and include all extracted data -## CONTEXT (for reference only) -{contextText if contextText else ""} +## USER REQUEST (for context) ``` {userPrompt} ``` + +## CONTEXT +{contextText if contextText else ""} """ else: prompt = f"""# TASK: Generate Section Content @@ -1899,16 +1910,11 @@ LANGUAGE: Generate all content in {language.upper()} language. All text, titles, {contentPartsText if contentPartsText else "(No content parts specified for this section)"} ## INSTRUCTIONS -1. Generate content for section "{sectionId}" based on the generation hint above -2. Use the available content parts to populate this section -3. For extracted text: Format appropriately based on content_type ({contentType}) -4. Ensure the generated content is self-contained and understandable independently -5. Return ONLY a JSON object with an "elements" array -6. Each element should match the content_type: {contentType} -7. CRITICAL - NO HTML/STYLING: Do NOT include HTML tags, CSS styles, or any formatting markup in text content. Return plain text only. Formatting is handled automatically by the renderer. -8. For paragraphs: Return plain text only, no HTML tags like
, or style attributes -9. For headings: Return plain text only, no HTML tags or styling -10. For images: If you need to reference an image, describe it in altText. Do NOT include base64 data - images are handled separately +1. Extract data only from provided ContentParts. Never invent or generate data. +2. If ContentParts contain no data, return empty structures (empty rows array for tables). +3. Format based on content_type ({contentType}). +4. Return only valid JSON with "elements" array. +5. No HTML/styling: Plain text only, no markup. ## OUTPUT FORMAT Return a JSON object with this structure: @@ -1922,16 +1928,18 @@ Return a JSON object with this structure: ] }} -CRITICAL: -- "content" MUST always be an object (never a string) -- For text content: Return plain text only, NO HTML tags, NO CSS styles, NO formatting markup -- Return ONLY valid JSON. Do not include any explanatory text outside the JSON +Output requirements: +- "content" must be an object (never a string) +- Return only valid JSON, no explanatory text +- No invented data: Return empty structures if ContentParts have no data -## CONTEXT (for reference only) -{contextText if contextText else ""} +## USER REQUEST ``` {userPrompt} ``` + +## CONTEXT +{contextText if contextText else ""} """ return prompt diff --git a/modules/services/serviceAi/subStructureGeneration.py b/modules/services/serviceAi/subStructureGeneration.py index 88f2e1e1..a3db2072 100644 --- a/modules/services/serviceAi/subStructureGeneration.py +++ b/modules/services/serviceAi/subStructureGeneration.py @@ -250,44 +250,45 @@ Continue generating the remaining chapters now. language = self._getUserLanguage() logger.debug(f"Using language from services (user intention analysis) for structure generation: {language}") - prompt = f"""CRITICAL OUTPUT REQUIREMENT: This is a PLANNING task, not a generation task. You MUST return EXACTLY ONE complete JSON object. Do NOT generate multiple JSON objects, alternatives, or variations. Do NOT use separators like "---" between JSON objects. Return the single best JSON structure that matches the requirements below. + prompt = f"""# TASK: Generate Chapter Structure -USER REQUEST (for context): +This is a PLANNING task. Return EXACTLY ONE complete JSON object. Do not generate multiple JSON objects, alternatives, or variations. Do not use separators like "---" between JSON objects. + +## USER REQUEST (for context) ``` {userPrompt} ``` LANGUAGE: Generate all content in {language.upper()} language. All text, titles, headings, paragraphs, and content must be written in {language.upper()}. -AVAILABLE CONTENT PARTS: +## AVAILABLE CONTENT PARTS {contentPartsIndex} -TASK: Generate Chapter Structure for the documents to be generated. - -IMPORTANT - CHAPTER INDEPENDENCE: +## CHAPTER INDEPENDENCE - Each chapter is independent and self-contained - One chapter does NOT have information about another chapter - Each chapter must provide its own context and be understandable alone -CONTENT ASSIGNMENT: +## CONTENT ASSIGNMENT - Assign ContentParts to chapters via contentParts object -- For data extraction, the type of a contentPart (image, text, etc.) is NOT relevant - only what is specified in the instruction matters -- Include ALL relevant parts from same source when needed for structured data extraction +- For data extraction, the type of a contentPart (image, text, etc.) is not relevant - only what is specified in the instruction matters +- Include all relevant parts from same source when needed for structured data extraction - Each contentPart can have either: - "instruction": For AI extraction prompts (how to process/extract from this part) - "caption": For user-facing presentation (how to display/reference this part in the document) - Both can be present if needed - Chapters without contentParts can only generate generic content (not document-specific) -FORMATTING: -- Formatting is handled automatically - focus on CONTENT and STRUCTURE only +## FORMATTING +- Formatting is handled automatically - focus on content and structure only -CHAPTER STRUCTURE: +## CHAPTER STRUCTURE - chapter id, level (1, 2, 3, etc.), title - contentParts: {{"partId": {{"instruction": "..."}} or {{"caption": "..."}} or both}} - Compact mapping of part IDs to their extraction instructions and/or presentation captions -- generationHint: Self-contained description (if contentParts is empty, must be VERY DETAILED) +- generationHint: Self-contained description that reflects the user's intent for the specific data. If contentParts is empty, must be detailed. If contentParts are present, the hint should guide how to extract and structure the data according to the user's requirements (e.g., specific columns, format, structure) -RETURN JSON: +## OUTPUT FORMAT +Return JSON: {{ "metadata": {{ "title": "Document Title", diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py index 7c4649e8..be38de05 100644 --- a/modules/services/serviceExtraction/mainServiceExtraction.py +++ b/modules/services/serviceExtraction/mainServiceExtraction.py @@ -652,6 +652,18 @@ class ExtractionService: mergeType="concatenate" ) + # Check if this is an elements response format (elements array structure) + # This is used for section content generation where multiple ContentParts are processed + isElementsResponse = self._isElementsResponse(content_parts) + + if isElementsResponse: + # Merge JSON elements responses intelligently (merge tables, combine elements) + logger.info(f"Detected 'elements' JSON response format - merging {len(content_parts)} JSON responses") + merged_json = self._mergeElementsResponses(content_parts) + merged_json_str = json.dumps(merged_json, indent=2, ensure_ascii=False) + logger.info(f"Successfully merged 'elements' JSON responses into single unified JSON ({len(merged_json_str)} chars)") + return merged_json_str + # Check if this is a JSON extraction response format (extracted_content structure) # If so, merge JSON structures intelligently before applying regular merging isJsonExtractionResponse = self._isJsonExtractionResponse(content_parts) @@ -736,11 +748,112 @@ class ExtractionService: return False + def _isElementsResponse(self, content_parts: List[ContentPart]) -> bool: + """Check if contentParts contain JSON responses with an 'elements' array (e.g., section content).""" + if not content_parts: + return False + + firstPartData = content_parts[0].data if content_parts[0].data else "" + if not isinstance(firstPartData, str): + return False + + strippedData = stripCodeFences(firstPartData.strip()) + if not strippedData.startswith(('{', '[')): + return False + + try: + parsed = json.loads(strippedData) + if isinstance(parsed, dict) and "elements" in parsed and isinstance(parsed["elements"], list): + return True + except: + pass + + return False + + def _mergeElementsResponses(self, content_parts: List[ContentPart]) -> Dict[str, Any]: + """Merge multiple JSON responses with an 'elements' array into one unified response. + Specifically designed to merge tables within the 'elements' array. + Empty tables (no rows) are ignored if a table with the same headers already has data. + """ + merged_elements = [] + table_headers_map: Dict[str, List[Dict[str, Any]]] = {} # headers_tuple -> [table_contents] + + for part in content_parts: + if not part.data: + continue + + # Handle multiple JSON blocks in a single response (separated by ---) + partDataBlocks = part.data.split('---') + + for blockData in partDataBlocks: + if not blockData.strip(): + continue + + try: + strippedData = stripCodeFences(blockData.strip()) + if not strippedData: + continue + + parsed = json.loads(strippedData) + if isinstance(parsed, dict) and "elements" in parsed and isinstance(parsed["elements"], list): + for element in parsed["elements"]: + if isinstance(element, dict) and element.get("type") == "table" and "content" in element: + table_content = element["content"] + headers = table_content.get("headers", []) + rows = table_content.get("rows", []) + + if headers: + headers_key = tuple(headers) + + # If table has no rows, only add it if no table with these headers exists yet + if not rows: + if headers_key not in table_headers_map: + # No table with these headers exists - keep empty table for now + table_headers_map[headers_key] = [] + # If a table with these headers already exists (with or without data), skip empty table + continue + + # Table has rows - add to merge map + if headers_key not in table_headers_map: + table_headers_map[headers_key] = [] + table_headers_map[headers_key].append(table_content) + else: + # Keep non-table elements as is, but avoid duplicates if possible + if element not in merged_elements: + merged_elements.append(element) + except Exception as e: + logger.warning(f"Failed to parse JSON elements response from part {part.id}: {str(e)}") + continue + + # Merge tables by headers - combine rows from tables with same headers + for headers_key, tables in table_headers_map.items(): + if not tables: + # Only empty tables with these headers - skip them + continue + + all_rows = [] + + for table_content in tables: + rows = table_content.get("rows", []) + all_rows.extend(rows) + + # Only add table if it has rows + if all_rows: + merged_elements.append({ + "type": "table", + "content": { + "headers": list(headers_key), + "rows": all_rows + } + }) + + return {"elements": merged_elements} + def _mergeJsonExtractionResponses(self, content_parts: List[ContentPart], originalContentParts: Optional[List[ContentPart]] = None) -> Dict[str, Any]: """Merge multiple JSON extraction responses into one unified response. Merges: - - Tables: Combines all table rows, preserves headers, removes duplicates + - Tables: Combines all table rows, preserves headers (duplicates preserved) - Text: Combines all text blocks - Headings: Combines all headings arrays - Lists: Combines all list items @@ -927,16 +1040,10 @@ class ExtractionService: for headers_key, tables in table_headers_map.items(): # Collect all rows from tables with same headers all_rows = [] - seen_rows = set() # Track duplicates using tuple representation for table in tables: rows = table.get("rows", []) - for row in rows: - # Convert row to tuple for duplicate detection - row_tuple = tuple(str(cell) for cell in row) - if row_tuple not in seen_rows: - seen_rows.add(row_tuple) - all_rows.append(row) + all_rows.extend(rows) # Create merged table if all_rows: @@ -1450,6 +1557,32 @@ class ExtractionService: contentPart, prompt, options, failoverModelList, aiObjects, None # Don't pass progressCallback to avoid double logging ) + # Write debug files for generation phase (section content generation) + # Check for DATA_GENERATE or DATA_ANALYSE (used for section generation) + isGenerationPhase = False + if options and hasattr(options, 'operationType'): + isGenerationPhase = (options.operationType == OperationTypeEnum.DATA_GENERATE or + options.operationType == OperationTypeEnum.DATA_ANALYSE) + + if isGenerationPhase: + if self.services and hasattr(self.services, 'utils') and hasattr(self.services.utils, 'writeDebugFile'): + try: + # Create debug filename with contentPart ID or label + partId = contentPart.id[:8] if contentPart.id else f"part_{partIndex+1}" + partLabelSafe = (contentPart.label or f"part_{partIndex+1}").replace(" ", "_").replace("/", "_").replace("\\", "_")[:30] + debugPrefix = f"generation_contentPart_{partId}_{partLabelSafe}" + + # Write prompt + self.services.utils.writeDebugFile(prompt, f"{debugPrefix}_prompt") + + # Write response + responseContent = partResult.content if partResult.content else "" + self.services.utils.writeDebugFile(responseContent, f"{debugPrefix}_response") + + logger.debug(f"Wrote debug files for contentPart {partId} (generation): {debugPrefix}_prompt, {debugPrefix}_response") + except Exception as debugError: + logger.warning(f"Failed to write debug file for contentPart {contentPart.id}: {str(debugError)}") + # Update completed count and log progress completedCount[0] += 1 if progressCallback: