From db1bcfcdeb8e00b398704a637549440b5febc5fc Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Fri, 3 Oct 2025 01:41:50 +0200
Subject: [PATCH] Add tested actions for web, extraction, and generation
---
modules/interfaces/interfaceAiObjects.py | 73 +-
modules/services/serviceAi/mainServiceAi.py | 414 +-
.../chunking/text_chunker.py | 4 +
.../formats/binary_extractor.py | 2 +-
.../formats/csv_extractor.py | 2 +-
.../formats/docx_extractor.py | 2 +-
.../formats/html_extractor.py | 2 +-
.../formats/image_extractor.py | 2 +-
.../formats/json_extractor.py | 2 +-
.../formats/pdf_extractor.py | 2 +-
.../formats/text_extractor.py | 2 +-
.../formats/xlsx_extractor.py | 2 +-
.../formats/xml_extractor.py | 2 +-
.../mainServiceExtraction.py | 8 +-
.../serviceExtraction/merging/table_merger.py | 2 +-
.../serviceExtraction/merging/text_merger.py | 2 +-
.../services/serviceExtraction/subPipeline.py | 87 +-
.../{utils/__init__.py => subUtils.py} | 2 -
.../mainServiceGeneration.py | 80 +-
.../renderers/base_renderer.py | 86 +
.../renderers/csv_renderer.py | 90 +
.../renderers/docx_renderer.py | 291 +
.../renderers/excel_renderer.py | 264 +
.../renderers/html_renderer.py | 94 +
.../renderers/json_renderer.py | 107 +
.../renderers/markdown_renderer.py | 89 +
.../renderers/pdf_renderer.py | 259 +
.../serviceGeneration/renderers/registry.py | 157 +
.../renderers/text_renderer.py | 129 +
modules/workflows/methods/methodAi.py | 9 +-
modules/workflows/methods/methodDocument.py | 633 +-
test_ai_document_response.md | 555 +
test_ai_with_documents.py | 214 +
test_generated_report.pdf | 1 +
test_generated_report_content.txt | 13 +
test_real_document_generation.py | 368 +
test_real_generated_report.docx | Bin 0 -> 41448 bytes
test_real_generated_report_content.txt | 13 +
...en_bauland_kanton-zuerich_trefferliste.txt | 121 -
...h_de_grundstueck_kaufen_kanton-zuerich.txt | 324 -
...latz_kanton_zuerich_grundstueck_kaufen.txt | 401 -
...en_bauland_kanton-zuerich_trefferliste.txt | 121 -
...h_de_grundstueck_kaufen_kanton-zuerich.txt | 324 -
...latz_kanton_zuerich_grundstueck_kaufen.txt | 401 -
....ch_de_kaufen_kanton-zurich_grundstuck.txt | 269 -
...en_bauland_kanton-zuerich_trefferliste.txt | 121 -
...h_de_grundstueck_kaufen_kanton-zuerich.txt | 324 -
...latz_kanton_zuerich_grundstueck_kaufen.txt | 401 -
.../additional_link_001_www.valueon.ch_.txt | 106 -
...ional_link_002_www.valueon.ch_services.txt | 110 -
...ional_link_003_www.valueon.ch_projects.txt | 354 -
...nal_link_004_www.valueon.ch_vernissage.txt | 68 -
...nal_link_005_www.valueon.ch_self-check.txt | 58 -
...ditional_link_006_www.valueon.ch_about.txt | 638 -
...tional_link_007_www.valueon.ch_contact.txt | 89 -
...tional_link_008_www.valueon.ch_privacy.txt | 103 -
...tional_link_009_www.valueon.ch_imprint.txt | 62 -
...al_link_010_www.moneyhouse.ch_de_terms.txt | 96 -
..._link_011_www.moneyhouse.ch_de_imprint.txt | 401 -
...ional_link_012_www.itreseller.ch_media.txt | 203 -
...itional_link_013_www.itreseller.ch_abo.txt | 157 -
..._link_014_www.itreseller.ch_newsletter.txt | 119 -
...itreseller.ch_rubriken_165_people.html.txt | 195 -
...reseller.ch_rubriken_163_business.html.txt | 195 -
...treseller.ch_rubriken_166_finance.html.txt | 173 -
...reseller.ch_rubriken_122_research.html.txt | 195 -
..._019_www.itreseller.ch_veranstaltungen.txt | 151 -
...tional_link_020_www.itreseller.ch_jobs.txt | 327 -
..._021_www.itreseller.ch_heftarchiv_2025.txt | 194 -
...-10_neuer-ceo-fuer-valueonmain-content.txt | 278 -
...onal_link_023_www.netzwoche.ch_contact.txt | 131 -
...nk_024_www.netzwoche.ch_bestofswissweb.txt | 260 -
...k_025_www.netzwoche.ch_bestofswissapps.txt | 227 -
...zwoche.ch_best-of-swiss-software-award.txt | 167 -
...ditional_link_027_www.netzwoche.ch_abo.txt | 190 -
...8_www.netzwoche.ch_newsletteranmeldung.txt | 151 -
...itional_link_029_www.netzwoche.ch_news.txt | 159 -
...ink_030_www.netzwoche.ch_tags_liveLive.txt | 7 -
...ww.netzwoche.ch_tags_wild-cardWildCard.txt | 7 -
...ional_link_032_www.itreseller.ch_media.txt | 203 -
...itional_link_033_www.itreseller.ch_abo.txt | 157 -
..._link_034_www.itreseller.ch_newsletter.txt | 119 -
...itreseller.ch_rubriken_165_people.html.txt | 195 -
...reseller.ch_rubriken_163_business.html.txt | 195 -
...treseller.ch_rubriken_166_finance.html.txt | 173 -
...reseller.ch_rubriken_122_research.html.txt | 195 -
..._039_www.itreseller.ch_veranstaltungen.txt | 151 -
...tional_link_040_www.itreseller.ch_jobs.txt | 327 -
..._041_www.itreseller.ch_heftarchiv_2025.txt | 194 -
.../main_url_001_www.valueon.ch_.txt | 107 -
.../main_url_002_www.valueon.ch_about.txt | 639 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
.../main_url_004_www.valueon.ch_.txt | 107 -
.../main_url_005_www.valueon.ch_about.txt | 639 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
...eller.ch_unternehmen_6046_Valueon.html.txt | 124 -
...news_2024-06-10_neuer-ceo-fuer-valueon.txt | 279 -
...alueon_vollzieht_Fuehrungswechsel.html.txt | 214 -
...eller.ch_unternehmen_6046_Valueon.html.txt | 124 -
...alueon_vollzieht_Fuehrungswechsel.html.txt | 214 -
...news_2024-06-10_neuer-ceo-fuer-valueon.txt | 279 -
.../main_url_013_www.valueon.ch_.txt | 107 -
.../main_url_014_www.valueon.ch_about.txt | 639 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
.../main_url_016_www.valueon.ch_.txt | 107 -
.../main_url_017_www.valueon.ch_about.txt | 639 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
...ional_link_001_www.valueon.ch_services.txt | 110 -
...ional_link_002_www.valueon.ch_projects.txt | 354 -
...nal_link_003_www.valueon.ch_vernissage.txt | 68 -
...nal_link_004_www.valueon.ch_self-check.txt | 58 -
...tional_link_005_www.valueon.ch_contact.txt | 89 -
...tional_link_006_www.valueon.ch_privacy.txt | 103 -
...tional_link_007_www.valueon.ch_imprint.txt | 62 -
...al_link_008_www.moneyhouse.ch_de_terms.txt | 96 -
..._link_009_www.moneyhouse.ch_de_imprint.txt | 401 -
...itional_link_010_www.moneyhouse.ch_de_.txt | 642 -
...3161481_reportscreditworthiness-report.txt | 643 -
...company_valueon-ag-4663161481_timeline.txt | 515 -
...company_valueon-ag-4663161481_messages.txt | 625 -
...erviewchnrCH02030306513uidCHE113416882.txt | 593 -
..._company_valueon-ag-4663161481_revenue.txt | 565 -
...eon-ag-4663161481_networkneighbourhood.txt | 743 -
.../main_url_001_www.valueon.ch_.txt | 107 -
.../main_url_002_www.valueon.ch_about.txt | 639 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
.../main_url_004_www.valueon.ch_.txt | 107 -
.../main_url_005_www.valueon.ch_about.txt | 639 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
...ional_link_001_www.valueon.ch_services.txt | 110 -
...ional_link_002_www.valueon.ch_projects.txt | 354 -
...nal_link_003_www.valueon.ch_vernissage.txt | 68 -
...nal_link_004_www.valueon.ch_self-check.txt | 58 -
...tional_link_005_www.valueon.ch_contact.txt | 89 -
...tional_link_006_www.valueon.ch_privacy.txt | 103 -
...tional_link_007_www.valueon.ch_imprint.txt | 62 -
...al_link_008_www.moneyhouse.ch_de_terms.txt | 96 -
..._link_009_www.moneyhouse.ch_de_imprint.txt | 401 -
...itional_link_010_www.moneyhouse.ch_de_.txt | 642 -
...3161481_reportscreditworthiness-report.txt | 623 -
...company_valueon-ag-4663161481_timeline.txt | 515 -
...company_valueon-ag-4663161481_messages.txt | 625 -
...erviewchnrCH02030306513uidCHE113416882.txt | 593 -
..._company_valueon-ag-4663161481_revenue.txt | 565 -
...eon-ag-4663161481_networkneighbourhood.txt | 743 -
...ional_link_017_www.itreseller.ch_media.txt | 203 -
...itional_link_018_www.itreseller.ch_abo.txt | 157 -
..._link_019_www.itreseller.ch_newsletter.txt | 119 -
...itreseller.ch_rubriken_165_people.html.txt | 195 -
...reseller.ch_rubriken_163_business.html.txt | 195 -
...treseller.ch_rubriken_166_finance.html.txt | 173 -
...reseller.ch_rubriken_122_research.html.txt | 195 -
..._024_www.itreseller.ch_veranstaltungen.txt | 151 -
...tional_link_025_www.itreseller.ch_jobs.txt | 327 -
..._026_www.itreseller.ch_heftarchiv_2025.txt | 194 -
.../main_url_001_www.valueon.ch_.txt | 107 -
.../main_url_002_www.valueon.ch_about.txt | 639 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
.../main_url_004_www.valueon.ch_about.txt | 639 -
...eller.ch_unternehmen_6046_Valueon.html.txt | 124 -
.../main_url_006_www.valueon.ch_.txt | 107 -
...se.ch_de_company_valueon-ag-4663161481.txt | 364 -
...24.ch_fr_terrain_acheter_canton-zurich.txt | 437 +
...h_it_terreno_acquistare_cantone-zurigo.txt | 437 +
..._link_003_www.comparis.ch_versicherung.txt | 487 +
...onal_link_004_www.comparis.ch_finanzen.txt | 408 +
...tional_link_005_www.comparis.ch_wohnen.txt | 423 +
...al_link_006_www.comparis.ch_mobilitaet.txt | 407 +
...al_link_007_www.comparis.ch_gesundheit.txt | 362 +
...ional_link_008_www.comparis.ch_telecom.txt | 353 +
...comparis.ch_neu-in-der-schweiz_default.txt | 434 +
..._www.comparis.ch_krankenkassen_default.txt | 253 +
...comparis.ch_lebensversicherung_default.txt | 589 +
...2_www.comparis.ch_rechtsschutz_default.txt | 496 +
...e_hypothekenintlinkh_MortgagesServices.txt | 255 +
..._c_de_ratgeberintlinkh_AdvisorRatgeber.txt | 215 +
...bauland_kanton-zuerich_trefferlisteep2.txt | 214 +
...bauland_kanton-zuerich_trefferlisteep4.txt | 200 +
...additional_link_017_www.homegate.ch_de.txt | 165 +
...18_realadvisor.ch_de_immobilien-kaufen.txt | 250 +
...19_realadvisor.ch_de_immobilien-mieten.txt | 235 +
...sor.ch_it_comprare_canton-zurigo_trama.txt | 356 +
...or.ch_fr_acheter_canton-zurich_terrain.txt | 436 +
...aladvisor.ch_en_buy_canton-zurich_plot.txt | 430 +
...onal_link_023_realadvisor.ch_de_kaufen.txt | 417 +
...24_realadvisor.ch_de_kaufen_grundstuck.txt | 398 +
...en_grundstuck_8162-steinmaur-B62R-PZDJ.txt | 237 +
...aufen_grundstuck_8340-hinwil-LDX9-L6M2.txt | 216 +
.../additional_link_027_www.immoyou.ch_.txt | 199 +
...link_028_www.immoyou.ch_kaufen-schweiz.txt | 423 +
..._www.immoyou.ch_wohnung-kaufen-schweiz.txt | 423 +
...030_www.immoyou.ch_haus-kaufen-schweiz.txt | 423 +
...link_031_www.immoyou.ch_mieten-schweiz.txt | 403 +
..._www.immoyou.ch_wohnung-mieten-schweiz.txt | 423 +
...033_www.immoyou.ch_haus-mieten-schweiz.txt | 419 +
...w.immoyou.ch_grundstuck-kaufen-schweiz.txt | 399 +
...h_de_grundstueck_kaufen_kanton-zuerich.txt | 443 +
...latz_kanton_zuerich_grundstueck_kaufen.txt | 516 +
...en_bauland_kanton-zuerich_trefferliste.txt | 174 +
....ch_de_kaufen_kanton-zurich_grundstuck.txt | 435 +
...you.ch_grundstuck-kaufen-kanton-zurich.txt | 325 +-
test_web_integration.py | 4 +-
test_web_integration_result.md | 23193 +++++++++-------
203 files changed, 30883 insertions(+), 44241 deletions(-)
rename modules/services/serviceExtraction/{utils/__init__.py => subUtils.py} (96%)
create mode 100644 modules/services/serviceGeneration/renderers/base_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/csv_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/docx_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/excel_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/html_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/json_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/markdown_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/pdf_renderer.py
create mode 100644 modules/services/serviceGeneration/renderers/registry.py
create mode 100644 modules/services/serviceGeneration/renderers/text_renderer.py
create mode 100644 test_ai_document_response.md
create mode 100644 test_ai_with_documents.py
create mode 100644 test_generated_report.pdf
create mode 100644 test_generated_report_content.txt
create mode 100644 test_real_document_generation.py
create mode 100644 test_real_generated_report.docx
create mode 100644 test_real_generated_report_content.txt
delete mode 100644 test_web_content_20251002_204000/main_url_001_www.homegate.ch_kaufen_bauland_kanton-zuerich_trefferliste.txt
delete mode 100644 test_web_content_20251002_204000/main_url_002_www.immoscout24.ch_de_grundstueck_kaufen_kanton-zuerich.txt
delete mode 100644 test_web_content_20251002_204000/main_url_003_www.comparis.ch_immobilien_marktplatz_kanton_zuerich_grundstueck_kaufen.txt
delete mode 100644 test_web_content_20251002_204000/main_url_004_www.homegate.ch_kaufen_bauland_kanton-zuerich_trefferliste.txt
delete mode 100644 test_web_content_20251002_204000/main_url_005_www.immoscout24.ch_de_grundstueck_kaufen_kanton-zuerich.txt
delete mode 100644 test_web_content_20251002_204000/main_url_006_www.comparis.ch_immobilien_marktplatz_kanton_zuerich_grundstueck_kaufen.txt
delete mode 100644 test_web_content_20251002_204000/main_url_008_realadvisor.ch_de_kaufen_kanton-zurich_grundstuck.txt
delete mode 100644 test_web_content_20251002_204000/main_url_009_www.homegate.ch_kaufen_bauland_kanton-zuerich_trefferliste.txt
delete mode 100644 test_web_content_20251002_204000/main_url_010_www.immoscout24.ch_de_grundstueck_kaufen_kanton-zuerich.txt
delete mode 100644 test_web_content_20251002_204000/main_url_011_www.comparis.ch_immobilien_marktplatz_kanton_zuerich_grundstueck_kaufen.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_001_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_002_www.valueon.ch_services.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_003_www.valueon.ch_projects.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_004_www.valueon.ch_vernissage.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_005_www.valueon.ch_self-check.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_006_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_007_www.valueon.ch_contact.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_008_www.valueon.ch_privacy.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_009_www.valueon.ch_imprint.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_010_www.moneyhouse.ch_de_terms.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_011_www.moneyhouse.ch_de_imprint.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_012_www.itreseller.ch_media.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_013_www.itreseller.ch_abo.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_014_www.itreseller.ch_newsletter.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_015_www.itreseller.ch_rubriken_165_people.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_016_www.itreseller.ch_rubriken_163_business.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_017_www.itreseller.ch_rubriken_166_finance.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_018_www.itreseller.ch_rubriken_122_research.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_019_www.itreseller.ch_veranstaltungen.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_020_www.itreseller.ch_jobs.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_021_www.itreseller.ch_heftarchiv_2025.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_022_www.netzwoche.ch_news_2024-06-10_neuer-ceo-fuer-valueonmain-content.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_023_www.netzwoche.ch_contact.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_024_www.netzwoche.ch_bestofswissweb.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_025_www.netzwoche.ch_bestofswissapps.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_026_www.netzwoche.ch_best-of-swiss-software-award.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_027_www.netzwoche.ch_abo.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_028_www.netzwoche.ch_newsletteranmeldung.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_029_www.netzwoche.ch_news.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_030_www.netzwoche.ch_tags_liveLive.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_031_www.netzwoche.ch_tags_wild-cardWildCard.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_032_www.itreseller.ch_media.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_033_www.itreseller.ch_abo.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_034_www.itreseller.ch_newsletter.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_035_www.itreseller.ch_rubriken_165_people.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_036_www.itreseller.ch_rubriken_163_business.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_037_www.itreseller.ch_rubriken_166_finance.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_038_www.itreseller.ch_rubriken_122_research.html.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_039_www.itreseller.ch_veranstaltungen.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_040_www.itreseller.ch_jobs.txt
delete mode 100644 test_web_content_20251002_205603/additional_link_041_www.itreseller.ch_heftarchiv_2025.txt
delete mode 100644 test_web_content_20251002_205603/main_url_001_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_205603/main_url_002_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_205603/main_url_003_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
delete mode 100644 test_web_content_20251002_205603/main_url_004_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_205603/main_url_005_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_205603/main_url_006_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
delete mode 100644 test_web_content_20251002_205603/main_url_007_www.itreseller.ch_unternehmen_6046_Valueon.html.txt
delete mode 100644 test_web_content_20251002_205603/main_url_008_www.netzwoche.ch_news_2024-06-10_neuer-ceo-fuer-valueon.txt
delete mode 100644 test_web_content_20251002_205603/main_url_009_www.itreseller.ch_Artikel_101024_Valueon_vollzieht_Fuehrungswechsel.html.txt
delete mode 100644 test_web_content_20251002_205603/main_url_010_www.itreseller.ch_unternehmen_6046_Valueon.html.txt
delete mode 100644 test_web_content_20251002_205603/main_url_011_www.itreseller.ch_Artikel_101024_Valueon_vollzieht_Fuehrungswechsel.html.txt
delete mode 100644 test_web_content_20251002_205603/main_url_012_www.netzwoche.ch_news_2024-06-10_neuer-ceo-fuer-valueon.txt
delete mode 100644 test_web_content_20251002_205603/main_url_013_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_205603/main_url_014_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_205603/main_url_015_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
delete mode 100644 test_web_content_20251002_205603/main_url_016_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_205603/main_url_017_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_205603/main_url_018_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_001_www.valueon.ch_services.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_002_www.valueon.ch_projects.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_003_www.valueon.ch_vernissage.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_004_www.valueon.ch_self-check.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_005_www.valueon.ch_contact.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_006_www.valueon.ch_privacy.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_007_www.valueon.ch_imprint.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_008_www.moneyhouse.ch_de_terms.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_009_www.moneyhouse.ch_de_imprint.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_010_www.moneyhouse.ch_de_.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_011_www.moneyhouse.ch_de_company_valueon-ag-4663161481_reportscreditworthiness-report.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_012_www.moneyhouse.ch_de_company_valueon-ag-4663161481_timeline.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_013_www.moneyhouse.ch_de_company_valueon-ag-4663161481_messages.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_014_www.moneyhouse.ch_de_commercialregister_overviewchnrCH02030306513uidCHE113416882.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_015_www.moneyhouse.ch_de_company_valueon-ag-4663161481_revenue.txt
delete mode 100644 test_web_content_20251002_212142/additional_link_016_www.moneyhouse.ch_de_company_valueon-ag-4663161481_networkneighbourhood.txt
delete mode 100644 test_web_content_20251002_212142/main_url_001_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_212142/main_url_002_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_212142/main_url_003_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
delete mode 100644 test_web_content_20251002_212142/main_url_004_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_212142/main_url_005_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_212142/main_url_006_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_001_www.valueon.ch_services.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_002_www.valueon.ch_projects.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_003_www.valueon.ch_vernissage.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_004_www.valueon.ch_self-check.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_005_www.valueon.ch_contact.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_006_www.valueon.ch_privacy.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_007_www.valueon.ch_imprint.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_008_www.moneyhouse.ch_de_terms.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_009_www.moneyhouse.ch_de_imprint.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_010_www.moneyhouse.ch_de_.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_011_www.moneyhouse.ch_de_company_valueon-ag-4663161481_reportscreditworthiness-report.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_012_www.moneyhouse.ch_de_company_valueon-ag-4663161481_timeline.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_013_www.moneyhouse.ch_de_company_valueon-ag-4663161481_messages.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_014_www.moneyhouse.ch_de_commercialregister_overviewchnrCH02030306513uidCHE113416882.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_015_www.moneyhouse.ch_de_company_valueon-ag-4663161481_revenue.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_016_www.moneyhouse.ch_de_company_valueon-ag-4663161481_networkneighbourhood.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_017_www.itreseller.ch_media.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_018_www.itreseller.ch_abo.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_019_www.itreseller.ch_newsletter.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_020_www.itreseller.ch_rubriken_165_people.html.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_021_www.itreseller.ch_rubriken_163_business.html.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_022_www.itreseller.ch_rubriken_166_finance.html.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_023_www.itreseller.ch_rubriken_122_research.html.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_024_www.itreseller.ch_veranstaltungen.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_025_www.itreseller.ch_jobs.txt
delete mode 100644 test_web_content_20251002_212426/additional_link_026_www.itreseller.ch_heftarchiv_2025.txt
delete mode 100644 test_web_content_20251002_212426/main_url_001_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_212426/main_url_002_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_212426/main_url_003_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
delete mode 100644 test_web_content_20251002_212426/main_url_004_www.valueon.ch_about.txt
delete mode 100644 test_web_content_20251002_212426/main_url_005_www.itreseller.ch_unternehmen_6046_Valueon.html.txt
delete mode 100644 test_web_content_20251002_212426/main_url_006_www.valueon.ch_.txt
delete mode 100644 test_web_content_20251002_212426/main_url_007_www.moneyhouse.ch_de_company_valueon-ag-4663161481.txt
create mode 100644 test_web_content_20251002_215219/additional_link_001_www.immoscout24.ch_fr_terrain_acheter_canton-zurich.txt
create mode 100644 test_web_content_20251002_215219/additional_link_002_www.immoscout24.ch_it_terreno_acquistare_cantone-zurigo.txt
create mode 100644 test_web_content_20251002_215219/additional_link_003_www.comparis.ch_versicherung.txt
create mode 100644 test_web_content_20251002_215219/additional_link_004_www.comparis.ch_finanzen.txt
create mode 100644 test_web_content_20251002_215219/additional_link_005_www.comparis.ch_wohnen.txt
create mode 100644 test_web_content_20251002_215219/additional_link_006_www.comparis.ch_mobilitaet.txt
create mode 100644 test_web_content_20251002_215219/additional_link_007_www.comparis.ch_gesundheit.txt
create mode 100644 test_web_content_20251002_215219/additional_link_008_www.comparis.ch_telecom.txt
create mode 100644 test_web_content_20251002_215219/additional_link_009_www.comparis.ch_neu-in-der-schweiz_default.txt
create mode 100644 test_web_content_20251002_215219/additional_link_010_www.comparis.ch_krankenkassen_default.txt
create mode 100644 test_web_content_20251002_215219/additional_link_011_www.comparis.ch_lebensversicherung_default.txt
create mode 100644 test_web_content_20251002_215219/additional_link_012_www.comparis.ch_rechtsschutz_default.txt
create mode 100644 test_web_content_20251002_215219/additional_link_013_www.homegate.ch_c_de_hypothekenintlinkh_MortgagesServices.txt
create mode 100644 test_web_content_20251002_215219/additional_link_014_www.homegate.ch_c_de_ratgeberintlinkh_AdvisorRatgeber.txt
create mode 100644 test_web_content_20251002_215219/additional_link_015_www.homegate.ch_kaufen_bauland_kanton-zuerich_trefferlisteep2.txt
create mode 100644 test_web_content_20251002_215219/additional_link_016_www.homegate.ch_kaufen_bauland_kanton-zuerich_trefferlisteep4.txt
create mode 100644 test_web_content_20251002_215219/additional_link_017_www.homegate.ch_de.txt
create mode 100644 test_web_content_20251002_215219/additional_link_018_realadvisor.ch_de_immobilien-kaufen.txt
create mode 100644 test_web_content_20251002_215219/additional_link_019_realadvisor.ch_de_immobilien-mieten.txt
create mode 100644 test_web_content_20251002_215219/additional_link_020_realadvisor.ch_it_comprare_canton-zurigo_trama.txt
create mode 100644 test_web_content_20251002_215219/additional_link_021_realadvisor.ch_fr_acheter_canton-zurich_terrain.txt
create mode 100644 test_web_content_20251002_215219/additional_link_022_realadvisor.ch_en_buy_canton-zurich_plot.txt
create mode 100644 test_web_content_20251002_215219/additional_link_023_realadvisor.ch_de_kaufen.txt
create mode 100644 test_web_content_20251002_215219/additional_link_024_realadvisor.ch_de_kaufen_grundstuck.txt
create mode 100644 test_web_content_20251002_215219/additional_link_025_realadvisor.ch_de_kaufen_grundstuck_8162-steinmaur-B62R-PZDJ.txt
create mode 100644 test_web_content_20251002_215219/additional_link_026_realadvisor.ch_de_kaufen_grundstuck_8340-hinwil-LDX9-L6M2.txt
create mode 100644 test_web_content_20251002_215219/additional_link_027_www.immoyou.ch_.txt
create mode 100644 test_web_content_20251002_215219/additional_link_028_www.immoyou.ch_kaufen-schweiz.txt
create mode 100644 test_web_content_20251002_215219/additional_link_029_www.immoyou.ch_wohnung-kaufen-schweiz.txt
create mode 100644 test_web_content_20251002_215219/additional_link_030_www.immoyou.ch_haus-kaufen-schweiz.txt
create mode 100644 test_web_content_20251002_215219/additional_link_031_www.immoyou.ch_mieten-schweiz.txt
create mode 100644 test_web_content_20251002_215219/additional_link_032_www.immoyou.ch_wohnung-mieten-schweiz.txt
create mode 100644 test_web_content_20251002_215219/additional_link_033_www.immoyou.ch_haus-mieten-schweiz.txt
create mode 100644 test_web_content_20251002_215219/additional_link_034_www.immoyou.ch_grundstuck-kaufen-schweiz.txt
create mode 100644 test_web_content_20251002_215219/main_url_001_www.immoscout24.ch_de_grundstueck_kaufen_kanton-zuerich.txt
create mode 100644 test_web_content_20251002_215219/main_url_002_www.comparis.ch_immobilien_marktplatz_kanton_zuerich_grundstueck_kaufen.txt
create mode 100644 test_web_content_20251002_215219/main_url_003_www.homegate.ch_kaufen_bauland_kanton-zuerich_trefferliste.txt
create mode 100644 test_web_content_20251002_215219/main_url_004_realadvisor.ch_de_kaufen_kanton-zurich_grundstuck.txt
rename test_web_content_20251002_204000/main_url_007_www.immoyou.ch_grundstuck-kaufen-kanton-zurich.txt => test_web_content_20251002_215219/main_url_005_www.immoyou.ch_grundstuck-kaufen-kanton-zurich.txt (82%)
diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py
index 38c5cb1e..d9c21ff1 100644
--- a/modules/interfaces/interfaceAiObjects.py
+++ b/modules/interfaces/interfaceAiObjects.py
@@ -596,7 +596,32 @@ class AiObjects:
"""Extract links from HTML/Markdown content."""
try:
import re
- from urllib.parse import urljoin, urlparse
+ from urllib.parse import urljoin, urlparse, quote, urlunparse
+
+ def _cleanUrl(url: str) -> str:
+ """Clean and encode URL to remove spaces and invalid characters."""
+ # Remove quotes and extra spaces
+ url = url.strip().strip('"\'')
+
+ # If it's a relative URL, make it absolute first
+ if not url.startswith(('http://', 'https://')):
+ url = urljoin(base_url, url)
+
+ # Parse and re-encode the URL properly
+ parsed = urlparse(url)
+ if parsed.query:
+ # Encode query parameters properly
+ encoded_query = quote(parsed.query, safe='=&')
+ url = urlunparse((
+ parsed.scheme,
+ parsed.netloc,
+ parsed.path,
+ parsed.params,
+ encoded_query,
+ parsed.fragment
+ ))
+
+ return url
links = []
@@ -605,39 +630,45 @@ class AiObjects:
html_links = re.findall(html_link_pattern, content, re.IGNORECASE)
for url in html_links:
- if url and (url.startswith('http://') or url.startswith('https://')):
- links.append(url)
- elif url and not url.startswith('#') and not url.startswith('javascript:'):
- # Convert relative URLs to absolute URLs
- absolute_url = urljoin(base_url, url)
- links.append(absolute_url)
+ if url and not url.startswith('#') and not url.startswith('javascript:'):
+ try:
+ cleaned_url = _cleanUrl(url)
+ links.append(cleaned_url)
+ logger.debug(f"Extracted HTML link: {url} -> {cleaned_url}")
+ except Exception as e:
+ logger.debug(f"Failed to clean HTML link {url}: {e}")
# Extract markdown links: [text](url) format
markdown_link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
markdown_links = re.findall(markdown_link_pattern, content)
for text, url in markdown_links:
- if url and (url.startswith('http://') or url.startswith('https://')):
- absolute_url = urljoin(base_url, url)
- # Only keep URLs from the same domain
- if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
- links.append(absolute_url)
- elif url and not url.startswith('#'):
- absolute_url = urljoin(base_url, url)
- if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
- links.append(absolute_url)
+ if url and not url.startswith('#'):
+ try:
+ cleaned_url = _cleanUrl(url)
+ # Only keep URLs from the same domain
+ if urlparse(cleaned_url).netloc == urlparse(base_url).netloc:
+ links.append(cleaned_url)
+ logger.debug(f"Extracted markdown link: {url} -> {cleaned_url}")
+ except Exception as e:
+ logger.debug(f"Failed to clean markdown link {url}: {e}")
# Extract plain URLs in the text
url_pattern = r'https?://[^\s\)]+'
plain_urls = re.findall(url_pattern, content)
for url in plain_urls:
- clean_url = url.rstrip('.,;!?')
- absolute_url = urljoin(base_url, clean_url)
- if urlparse(absolute_url).netloc == urlparse(base_url).netloc:
- if absolute_url not in links: # Avoid duplicates
- links.append(absolute_url)
+ try:
+ clean_url = url.rstrip('.,;!?')
+ cleaned_url = _cleanUrl(clean_url)
+ if urlparse(cleaned_url).netloc == urlparse(base_url).netloc:
+ if cleaned_url not in links: # Avoid duplicates
+ links.append(cleaned_url)
+ logger.debug(f"Extracted plain URL: {url} -> {cleaned_url}")
+ except Exception as e:
+ logger.debug(f"Failed to clean plain URL {url}: {e}")
+ logger.debug(f"Total links extracted and cleaned: {len(links)}")
return links
except Exception as e:
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index e67c1958..00f67f8a 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -53,46 +53,8 @@ class AiService:
logger.info("AiObjects.create() completed")
return instance
- # AI Text Generation
- async def callAiText(
- self,
- prompt: str,
- documents: Optional[List[ChatDocument]] = None,
- processDocumentsIndividually: bool = False,
- options: Optional[AiCallOptions] = None,
- ) -> str:
- """Call AI for text generation using interface.call()."""
- try:
- documentContent = ""
- if documents:
- documentContent = await self._processDocumentsForAi(
- documents,
- options.operationType if options else "general",
- options.compressContext if options else True,
- options.processDocumentsIndividually if options else processDocumentsIndividually,
- prompt,
- options
- )
-
- effectiveOptions = options or AiCallOptions()
- # Compute maxContextBytes if not provided: conservative defaults per model tag could be added here
- if options and options.maxContextBytes is None:
- options.maxContextBytes = 16000 # bytes, conservative default if model limit unknown
-
- request = AiCallRequest(
- prompt=prompt,
- context=documentContent or None,
- options=effectiveOptions,
- )
-
- response = await self.aiObjects.call(request)
- return response.content
- except Exception as e:
- logger.error(f"Error in AI text generation: {str(e)}")
- return f"Error: {str(e)}"
-
# AI Image Analysis
- async def callAiImage(
+ async def readImage(
self,
prompt: str,
imageData: Union[str, bytes],
@@ -146,40 +108,40 @@ class AiService:
# Use AI to generate optimized Tavily search query and selection strategy
query_optimizer_prompt = f"""You are a search query optimizer and web page selector.
-Given a user query, perform two tasks:
+ Given a user query, perform two tasks:
-TASK 1 - GENERATE TAVILY SEARCH QUERY:
-Analyze the user's intent and extract the core search terms.
-- For entity-specific queries (companies, people, products): Use entity name + key identifiers
-- For informational queries (how to, what is): Use core concept keywords
-- For transactional queries (where to buy, find services): Use action + category
-- Keep it 2-6 words maximum, keyword format only
+ TASK 1 - GENERATE TAVILY SEARCH QUERY:
+ Analyze the user's intent and extract the core search terms.
+ - For entity-specific queries (companies, people, products): Use entity name + key identifiers
+ - For informational queries (how to, what is): Use core concept keywords
+ - For transactional queries (where to buy, find services): Use action + category
+ - Keep it 2-6 words maximum, keyword format only
-TASK 2 - DEFINE URL SELECTION STRATEGY:
-Determine what type of results the user needs:
+ TASK 2 - DEFINE URL SELECTION STRATEGY:
+ Determine what type of results the user needs:
-A) SINGLE AUTHORITATIVE SOURCE
- Use when: Looking for specific entity information (company profile, person bio, specific product)
- Select: Official website, primary domain, or most authoritative single page
-
-B) MULTIPLE DIVERSE SOURCES
- Use when: Comparing options, finding services, shopping, research across sources
- Select: Multiple relevant URLs (5-15), prioritizing diversity and relevance
-
-C) SPECIFIC PAGE TYPE
- Use when: Looking for particular content (documentation, pricing, contact, careers)
- Select: Deep links to specific page types on relevant sites
+ A) SINGLE AUTHORITATIVE SOURCE
+ Use when: Looking for specific entity information (company profile, person bio, specific product)
+ Select: Official website, primary domain, or most authoritative single page
+
+ B) MULTIPLE DIVERSE SOURCES
+ Use when: Comparing options, finding services, shopping, research across sources
+ Select: Multiple relevant URLs (5-15), prioritizing diversity and relevance
+
+ C) SPECIFIC PAGE TYPE
+ Use when: Looking for particular content (documentation, pricing, contact, careers)
+ Select: Deep links to specific page types on relevant sites
-Return your response in this exact JSON format:
-{{
- "search_query": "your generated search query",
- "selection_strategy": "single|multiple|specific_page",
- "selection_criteria": "description of what URLs to prioritize",
- "expected_url_patterns": ["pattern1", "pattern2"],
- "estimated_result_count": number
-}}
+ Return your response in this exact JSON format:
+ {{
+ "search_query": "your generated search query",
+ "selection_strategy": "single|multiple|specific_page",
+ "selection_criteria": "description of what URLs to prioritize",
+ "expected_url_patterns": ["pattern1", "pattern2"],
+ "estimated_result_count": number
+ }}
-USER QUERY: {request.search_query}"""
+ USER QUERY: {request.search_query}"""
# Get AI response for query optimization
ai_request = AiCallRequest(
@@ -248,14 +210,13 @@ USER QUERY: {request.search_query}"""
# Create a prompt for AI to identify main URLs based on user's intention
ai_prompt = f"""
-Select the most relevant URLs from these search results:
+ Select the most relevant URLs from these search results:
-{chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
+ {chr(10).join([f"{i+1}. {url}" for i, url in enumerate(search_urls)])}
-Return only the URLs that are most relevant for the user's query.
-One URL per line.
-"""
-
+ Return only the URLs that are most relevant for the user's query.
+ One URL per line.
+ """
# Create AI call request
ai_request = AiCallRequest(
prompt=ai_prompt,
@@ -397,7 +358,7 @@ One URL per line.
"operationType": operationType,
"processDocumentsIndividually": processIndividually,
"maxSize": maxContextBytes,
- "chunkAllowed": options.chunkAllowed if options else True,
+ "chunkAllowed": not options.compressContext if options else True,
"textChunkSize": int(maxContextBytes * 0.3), # 30% of max for text chunks
"imageChunkSize": int(maxContextBytes * 0.5), # 50% of max for image chunks
"imageMaxPixels": 1024 * 1024, # 1MP default
@@ -534,18 +495,18 @@ One URL per line.
if not documents:
return ""
- # Calculate model-derived size limits
- maxContextBytes = self._calculateMaxContextBytes(options)
+ # Get model capabilities for size calculation
+ model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
# Build extraction options for chunking
extractionOptions: Dict[str, Any] = {
"prompt": prompt,
"operationType": options.operationType if options else "general",
"processDocumentsIndividually": True, # Process each document separately
- "maxSize": maxContextBytes,
+ "maxSize": model_capabilities["maxContextBytes"],
"chunkAllowed": True,
- "textChunkSize": int(maxContextBytes * 0.3),
- "imageChunkSize": int(maxContextBytes * 0.5),
+ "textChunkSize": model_capabilities["textChunkSize"],
+ "imageChunkSize": model_capabilities["imageChunkSize"],
"imageMaxPixels": 1024 * 1024,
"imageQuality": 85,
"mergeStrategy": {
@@ -555,6 +516,8 @@ One URL per line.
},
}
+ logger.debug(f"Per-chunk extraction options: {extractionOptions}")
+
try:
# Extract content with chunking
extractionResult = self.extractionService.extractContent(documents, extractionOptions)
@@ -570,7 +533,7 @@ One URL per line.
if part.typeGroup == "image":
# Process image with AI
try:
- aiResult = await self.callAiImage(
+ aiResult = await self.readImage(
prompt=prompt,
imageData=part.data,
mimeType=part.mimeType,
@@ -584,23 +547,36 @@ One URL per line.
elif part.typeGroup in ("text", "table", "structure"):
# Process text content with AI
try:
- aiResult = await self.callAiText(
+ logger.info(f"=== PROCESSING CHUNK {len(aiResults) + 1} ===")
+ logger.info(f"Chunk size: {len(part.data)} chars")
+ logger.info(f"Chunk preview: {part.data[:200]}...")
+
+ # Create AI call request for this chunk
+ request = AiCallRequest(
prompt=prompt,
- documents=None,
+ context=part.data,
options=options
)
- aiResults.append(aiResult)
+
+ # Make the call using AiObjects
+ response = await self.aiObjects.call(request)
+ aiResults.append(response.content)
+
+ logger.info(f"Chunk {len(aiResults)} processed: {len(response.content)} chars response")
+
except Exception as e:
logger.warning(f"Error processing text chunk: {str(e)}")
aiResults.append(f"[Error processing text: {str(e)}]")
# Merge AI results using ExtractionService
- mergeStrategy = {
- "groupBy": "typeGroup",
- "orderBy": "id",
- "mergeType": "concatenate",
- "chunkSeparator": "\n\n---\n\n"
- }
+ from modules.datamodels.datamodelExtraction import MergeStrategy
+
+ mergeStrategy = MergeStrategy(
+ groupBy="typeGroup",
+ orderBy="id",
+ mergeType="concatenate",
+ chunkSeparator="\n\n---\n\n"
+ )
mergedContent = self.extractionService.mergeAiResults(
extractionResult,
@@ -675,6 +651,9 @@ One URL per line.
if call_type == "planning":
return await self._callAiPlanning(prompt, placeholders, options)
else:
+ # Set processDocumentsIndividually from the legacy parameter if not set in options
+ if options.processDocumentsIndividually is None and documents:
+ options.processDocumentsIndividually = False # Default to batch processing
return await self._callAiText(prompt, documents, options)
def _determineCallType(self, documents: Optional[List[ChatDocument]], operation_type: str) -> str:
@@ -731,99 +710,182 @@ One URL per line.
"""
Handle text calls with document processing through ExtractionService.
"""
- # Get available models for text processing
- models = self._getModelsForOperation("text", options)
+ # Determine processing strategy based on options
+ if options.processDocumentsIndividually and documents:
+ # Use per-chunk processing for individual document processing
+ return await self._processDocumentsPerChunk(documents, prompt, options)
- for model in models:
- try:
- # Determine processing strategy based on options
- if options.processDocumentsIndividually and documents:
- # Use per-chunk processing for individual document processing
- return await self._processDocumentsPerChunk(documents, prompt, options)
+ # Check if we need chunking - if so, use per-chunk processing
+ if documents and not options.compressContext:
+ # Get model capabilities to check if chunking will be needed
+ model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
+ total_doc_size = sum(doc.fileSize or 0 for doc in documents)
+
+ if total_doc_size > model_capabilities["maxContextBytes"]:
+ logger.info(f"Document size ({total_doc_size}) exceeds model capacity ({model_capabilities['maxContextBytes']}), using per-chunk processing")
+ return await self._processDocumentsPerChunk(documents, prompt, options)
+
+ # Extract and process documents using ExtractionService
+ context = ""
+ if documents:
+ logger.info(f"=== EXTRACTING CONTENT FROM {len(documents)} DOCUMENTS ===")
+
+ # Get model capabilities for size calculation
+ model_capabilities = self._getModelCapabilitiesForContent(prompt, documents, options)
+
+ # Use new ChatDocument-based API
+ extraction_options = {
+ "prompt": prompt,
+ "operationType": options.operationType,
+ "processDocumentsIndividually": options.processDocumentsIndividually,
+ "maxSize": options.maxContextBytes or model_capabilities["maxContextBytes"],
+ "chunkAllowed": not options.compressContext,
+ "textChunkSize": model_capabilities["textChunkSize"],
+ "imageChunkSize": model_capabilities["imageChunkSize"],
+ "imageMaxPixels": 1024 * 1024,
+ "imageQuality": 85,
+ "mergeStrategy": {"groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate"}
+ }
+
+ logger.debug(f"Extraction options: {extraction_options}")
+
+ extracted_content = self.extractionService.extractContent(
+ documents=documents,
+ options=extraction_options
+ )
+
+ logger.info(f"Extraction completed: {len(extracted_content)} documents")
+
+ # Build context from list of ExtractedContent
+ if isinstance(extracted_content, list):
+ context_parts = []
+ chunk_count = 0
+ for ec in extracted_content:
+ for p in ec.parts:
+ if p.typeGroup in ["text", "table", "structure"] and p.data:
+ if p.metadata.get("chunk", False):
+ chunk_count += 1
+ context_parts.append(p.data)
+ elif p.typeGroup == "image" and p.data:
+ # Process image with AI using user prompt
+ try:
+ imageResult = await self.aiObjects.callImage(
+ prompt=prompt,
+ imageData=p.data,
+ mimeType=p.mimeType
+ )
+ context_parts.append(f"[Image Analysis]: {imageResult}")
+ except Exception as e:
+ logger.warning(f"AI image processing failed: {e}")
+ context_parts.append(f"[Image Analysis Failed]: {str(e)}")
- # Extract and process documents using ExtractionService
+ if chunk_count > 0:
+ logger.debug(f"=== PROCESSING CHUNKED CONTENT ===")
+ logger.debug(f"Total chunks: {chunk_count}")
+ logger.debug(f"Total context parts: {len(context_parts)}")
+
+ context = "\n\n---\n\n".join(context_parts)
+ else:
context = ""
- if documents:
- logger.info(f"=== EXTRACTING CONTENT FROM {len(documents)} DOCUMENTS ===")
-
- # Use new ChatDocument-based API
- extracted_content = self.extractionService.extractContent(
- documents=documents,
- options={
- "prompt": prompt,
- "operationType": options.operationType,
- "processDocumentsIndividually": options.processDocumentsIndividually,
- "maxSize": options.maxContextBytes or int(model.maxTokens * 0.9),
- "chunkAllowed": not options.compressContext,
- "textChunkSize": int((options.maxContextBytes or model.maxTokens * 4) * 0.3),
- "imageChunkSize": int((options.maxContextBytes or model.maxTokens * 4) * 0.5),
- "imageMaxPixels": 1024 * 1024,
- "imageQuality": 85,
- "mergeStrategy": {"groupBy": "typeGroup", "orderBy": "id", "mergeType": "concatenate"}
- }
- )
-
- logger.info(f"Extraction completed: {len(extracted_content)} documents")
-
- # Build context from list of ExtractedContent
- if isinstance(extracted_content, list):
- context_parts = []
- for ec in extracted_content:
- for p in ec.parts:
- if p.typeGroup in ["text", "table", "structure"] and p.data:
- context_parts.append(p.data)
- elif p.typeGroup == "image" and p.data:
- # Process image with AI using user prompt
- try:
- imageResult = await self.aiObjects.callImage(
- prompt=prompt,
- imageData=p.data,
- mimeType=p.mimeType
- )
- context_parts.append(f"[Image Analysis]: {imageResult}")
- except Exception as e:
- logger.warning(f"AI image processing failed: {e}")
- context_parts.append(f"[Image Analysis Failed]: {str(e)}")
- context = "\n\n---\n\n".join(context_parts)
- else:
- context = ""
-
- # Check size and reduce if needed
- full_prompt = prompt + "\n\n" + context if context else prompt
- logger.debug(f"AI call: {len(full_prompt)} chars (prompt: {len(prompt)}, context: {len(context)})")
-
- if self._exceedsTokenLimit(full_prompt, model, options.safetyMargin):
- full_prompt = self._reduceTextPrompt(prompt, context, model, options)
- logger.debug(f"Prompt reduced to {len(full_prompt)} chars")
-
- # Make AI call using the connector directly
- result = await self._makeAiCall(full_prompt, model, options)
- logger.debug(f"=== AI RESPONSE ===")
- logger.debug(f"Response length: {len(result)} chars")
- logger.debug(f"Response preview: {result[:200]}...")
- return result
-
- except Exception as e:
- logger.warning(f"Text model {model.name} failed: {e}")
- continue
- raise Exception("All text models failed - check model availability and capabilities")
+ # Check size and reduce if needed
+ full_prompt = prompt + "\n\n" + context if context else prompt
+ logger.debug(f"AI call: {len(full_prompt)} chars (prompt: {len(prompt)}, context: {len(context)})")
+
+ # Use AiObjects to select the best model and make the call
+ try:
+ # Create AI call request
+ request = AiCallRequest(
+ prompt=full_prompt,
+ context="", # Context is already included in the prompt
+ options=options
+ )
+
+ # Make the call using AiObjects (which handles model selection)
+ response = await self.aiObjects.call(request)
+ logger.debug(f"=== AI RESPONSE ===")
+ logger.debug(f"Response length: {len(response.content)} chars")
+ logger.debug(f"Response preview: {response.content[:200]}...")
+ return response.content
+
+ except Exception as e:
+ logger.error(f"AI call failed: {e}")
+ raise Exception(f"AI call failed: {e}")
- async def _makeAiCall(self, prompt: str, model: ModelCapabilities, options: AiCallOptions) -> str:
- """Make actual AI call using the connector."""
- if not self.aiObjects:
- raise Exception("AI objects not initialized")
+
+ def _getModelCapabilitiesForContent(self, prompt: str, documents: Optional[List[ChatDocument]], options: AiCallOptions) -> Dict[str, int]:
+ """
+ Get model capabilities for content processing, including appropriate size limits for chunking.
+ """
+ # Estimate total content size
+ prompt_size = len(prompt.encode('utf-8'))
+ document_size = 0
+ if documents:
+ # Rough estimate of document content size
+ for doc in documents:
+ document_size += doc.fileSize or 0
- # Create AI call request
- request = AiCallRequest(
- prompt=prompt,
- context="", # Context is already included in the prompt
- options=options
- )
+ total_size = prompt_size + document_size
- # Make the call
- response = await self.aiObjects.call(request)
- return response.content
+ # Use AiObjects to select the best model for this content size
+ # We'll simulate the model selection by checking available models
+ from modules.interfaces.interfaceAiObjects import aiModels
+
+ # Find the best model for this content size and operation
+ best_model = None
+ best_context_length = 0
+
+ for model_name, model_info in aiModels.items():
+ context_length = model_info.get("contextLength", 0)
+
+ # Skip models with no context length or too small for content
+ if context_length == 0:
+ continue
+
+ # Check if model supports the operation type
+ capabilities = model_info.get("capabilities", [])
+ if options.operationType == OperationType.IMAGE_ANALYSIS and "image_analysis" not in capabilities:
+ continue
+ elif options.operationType == OperationType.IMAGE_GENERATION and "image_generation" not in capabilities:
+ continue
+ elif options.operationType == OperationType.WEB_RESEARCH and "web_search" not in capabilities:
+ continue
+ elif "text_generation" not in capabilities:
+ continue
+
+ # Prefer models that can handle the content without chunking, but allow chunking if needed
+ if context_length >= total_size * 0.8: # 80% of content size
+ if context_length > best_context_length:
+ best_model = model_info
+ best_context_length = context_length
+ elif best_model is None: # Fallback to largest available model
+ if context_length > best_context_length:
+ best_model = model_info
+ best_context_length = context_length
+
+ # Fallback to a reasonable default if no model found
+ if best_model is None:
+ best_model = {
+ "contextLength": 128000, # GPT-4o default
+ "llmName": "gpt-4o"
+ }
+
+ # Calculate appropriate sizes
+ # Convert tokens to bytes (rough estimate: 1 token ≈ 4 characters)
+ context_length_bytes = int(best_model["contextLength"] * 4)
+ max_context_bytes = int(context_length_bytes * 0.9) # 90% of context length
+ text_chunk_size = int(max_context_bytes * 0.7) # 70% of max context for text chunks
+ image_chunk_size = int(max_context_bytes * 0.8) # 80% of max context for image chunks
+
+ logger.debug(f"Selected model: {best_model.get('llmName', 'unknown')} with context length: {best_model['contextLength']}")
+ logger.debug(f"Content size: {total_size} bytes, Max context: {max_context_bytes} bytes")
+ logger.debug(f"Text chunk size: {text_chunk_size} bytes, Image chunk size: {image_chunk_size} bytes")
+
+ return {
+ "maxContextBytes": max_context_bytes,
+ "textChunkSize": text_chunk_size,
+ "imageChunkSize": image_chunk_size
+ }
def _getModelsForOperation(self, operation_type: str, options: AiCallOptions) -> List[ModelCapabilities]:
"""
diff --git a/modules/services/serviceExtraction/chunking/text_chunker.py b/modules/services/serviceExtraction/chunking/text_chunker.py
index 35c75168..b5f6d582 100644
--- a/modules/services/serviceExtraction/chunking/text_chunker.py
+++ b/modules/services/serviceExtraction/chunking/text_chunker.py
@@ -7,6 +7,10 @@ from ..subRegistry import Chunker
class TextChunker(Chunker):
def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
maxBytes = int(options.get("textChunkSize", 40000))
+ import logging
+ logger = logging.getLogger(__name__)
+ logger.debug(f"TextChunker: textChunkSize from options: {options.get('textChunkSize', 'NOT_FOUND')}")
+ logger.debug(f"TextChunker: using maxBytes: {maxBytes}")
chunks: List[Dict[str, Any]] = []
current: List[str] = []
size = 0
diff --git a/modules/services/serviceExtraction/formats/binary_extractor.py b/modules/services/serviceExtraction/formats/binary_extractor.py
index 1c201c36..e6667fda 100644
--- a/modules/services/serviceExtraction/formats/binary_extractor.py
+++ b/modules/services/serviceExtraction/formats/binary_extractor.py
@@ -1,7 +1,7 @@
from typing import Any, Dict, List
import base64
-from ..utils import makeId
+from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/csv_extractor.py b/modules/services/serviceExtraction/formats/csv_extractor.py
index db3cf969..27233979 100644
--- a/modules/services/serviceExtraction/formats/csv_extractor.py
+++ b/modules/services/serviceExtraction/formats/csv_extractor.py
@@ -1,7 +1,7 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/docx_extractor.py b/modules/services/serviceExtraction/formats/docx_extractor.py
index 6cb75716..51384ffd 100644
--- a/modules/services/serviceExtraction/formats/docx_extractor.py
+++ b/modules/services/serviceExtraction/formats/docx_extractor.py
@@ -1,7 +1,7 @@
from typing import Any, Dict, List
import io
-from ..utils import makeId
+from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/html_extractor.py b/modules/services/serviceExtraction/formats/html_extractor.py
index 6c49c50c..09da02f4 100644
--- a/modules/services/serviceExtraction/formats/html_extractor.py
+++ b/modules/services/serviceExtraction/formats/html_extractor.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
from bs4 import BeautifulSoup
from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/image_extractor.py b/modules/services/serviceExtraction/formats/image_extractor.py
index 296eb50b..22327f50 100644
--- a/modules/services/serviceExtraction/formats/image_extractor.py
+++ b/modules/services/serviceExtraction/formats/image_extractor.py
@@ -1,7 +1,7 @@
from typing import Any, Dict, List
import base64
-from ..utils import makeId
+from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/json_extractor.py b/modules/services/serviceExtraction/formats/json_extractor.py
index 456eb08e..86eac791 100644
--- a/modules/services/serviceExtraction/formats/json_extractor.py
+++ b/modules/services/serviceExtraction/formats/json_extractor.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
import json
from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/pdf_extractor.py b/modules/services/serviceExtraction/formats/pdf_extractor.py
index 4d0d8058..59c88dc7 100644
--- a/modules/services/serviceExtraction/formats/pdf_extractor.py
+++ b/modules/services/serviceExtraction/formats/pdf_extractor.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
import base64
import io
-from ..utils import makeId
+from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/text_extractor.py b/modules/services/serviceExtraction/formats/text_extractor.py
index 5099d04c..a6d92bc1 100644
--- a/modules/services/serviceExtraction/formats/text_extractor.py
+++ b/modules/services/serviceExtraction/formats/text_extractor.py
@@ -1,7 +1,7 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/xlsx_extractor.py b/modules/services/serviceExtraction/formats/xlsx_extractor.py
index 141af3db..ea6396a2 100644
--- a/modules/services/serviceExtraction/formats/xlsx_extractor.py
+++ b/modules/services/serviceExtraction/formats/xlsx_extractor.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
import io
from datetime import datetime
-from ..utils import makeId
+from ..subUtils import makeId
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/formats/xml_extractor.py b/modules/services/serviceExtraction/formats/xml_extractor.py
index 7067924b..5aabea35 100644
--- a/modules/services/serviceExtraction/formats/xml_extractor.py
+++ b/modules/services/serviceExtraction/formats/xml_extractor.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, List
import xml.etree.ElementTree as ET
from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId
from ..subRegistry import Extractor
diff --git a/modules/services/serviceExtraction/mainServiceExtraction.py b/modules/services/serviceExtraction/mainServiceExtraction.py
index 9e989376..1c63e12c 100644
--- a/modules/services/serviceExtraction/mainServiceExtraction.py
+++ b/modules/services/serviceExtraction/mainServiceExtraction.py
@@ -74,9 +74,13 @@ class ExtractionService:
# Log chunking information
chunked_parts = [p for p in ec.parts if p.metadata.get("chunk", False)]
if chunked_parts:
- logger.debug(f"Chunking: {len(chunked_parts)} parts were chunked")
+ logger.debug(f"=== CHUNKING RESULTS ===")
+ logger.debug(f"Total parts: {len(ec.parts)}")
+ logger.debug(f"Chunked parts: {len(chunked_parts)}")
for chunk in chunked_parts:
- logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars")
+ logger.debug(f" Chunk: {chunk.label} - {len(chunk.data)} chars (parent: {chunk.parentId})")
+ else:
+ logger.debug(f"No chunking needed - {len(ec.parts)} parts fit within size limits")
ec = applyAiIfRequested(ec, options)
results.append(ec)
diff --git a/modules/services/serviceExtraction/merging/table_merger.py b/modules/services/serviceExtraction/merging/table_merger.py
index 04be404e..4f62358c 100644
--- a/modules/services/serviceExtraction/merging/table_merger.py
+++ b/modules/services/serviceExtraction/merging/table_merger.py
@@ -1,6 +1,6 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId
class TableMerger:
diff --git a/modules/services/serviceExtraction/merging/text_merger.py b/modules/services/serviceExtraction/merging/text_merger.py
index bb9e850d..38f7c6f0 100644
--- a/modules/services/serviceExtraction/merging/text_merger.py
+++ b/modules/services/serviceExtraction/merging/text_merger.py
@@ -1,6 +1,6 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ContentPart
-from ..utils import makeId
+from ..subUtils import makeId
class TextMerger:
diff --git a/modules/services/serviceExtraction/subPipeline.py b/modules/services/serviceExtraction/subPipeline.py
index cf12a246..65d8f1f2 100644
--- a/modules/services/serviceExtraction/subPipeline.py
+++ b/modules/services/serviceExtraction/subPipeline.py
@@ -1,7 +1,7 @@
from typing import Any, Dict, List
from modules.datamodels.datamodelExtraction import ExtractedContent, ContentPart
-from .utils import makeId
+from .subUtils import makeId
from .subRegistry import ExtractorRegistry, ChunkerRegistry
from .merging.text_merger import TextMerger
from .merging.table_merger import TableMerger
@@ -67,10 +67,29 @@ def runExtraction(extractorRegistry: ExtractorRegistry, chunkerRegistry: Chunker
return ExtractedContent(id=makeId(), parts=[part])
parts = extractor.extract(documentBytes, {"fileName": fileName, "mimeType": mimeType, "options": options})
- # Optional merge step
+
+ # Apply chunking and size limiting
+ parts = poolAndLimit(parts, chunkerRegistry, options)
+
+ # Optional merge step - but preserve chunks
mergeStrategy = options.get("mergeStrategy", {})
if mergeStrategy:
- parts = _mergeParts(parts, mergeStrategy)
+ import logging
+ logger = logging.getLogger(__name__)
+
+ # Don't merge chunks - they should stay separate for processing
+ non_chunk_parts = [p for p in parts if not p.metadata.get("chunk", False)]
+ chunk_parts = [p for p in parts if p.metadata.get("chunk", False)]
+
+ logger.debug(f"runExtraction: Preserving {len(chunk_parts)} chunks from merging")
+
+ if non_chunk_parts:
+ non_chunk_parts = _mergeParts(non_chunk_parts, mergeStrategy)
+
+ # Combine non-chunk parts with chunk parts (chunks stay separate)
+ parts = non_chunk_parts + chunk_parts
+
+ logger.debug(f"runExtraction: Final parts after merging: {len(parts)} (chunks: {len(chunk_parts)})")
return ExtractedContent(id=makeId(), parts=parts)
@@ -100,32 +119,56 @@ def poolAndLimit(parts: List[ContentPart], chunkerRegistry: ChunkerRegistry, opt
# If we have remaining parts and chunking is allowed, try chunking
if remaining and chunkAllowed:
+ import logging
+ logger = logging.getLogger(__name__)
+ logger.debug(f"=== CHUNKING ACTIVATED ===")
+ logger.debug(f"Remaining parts to chunk: {len(remaining)}")
+ logger.debug(f"Max size limit: {maxSize} bytes")
+ logger.debug(f"Current size used: {current} bytes")
+
for p in remaining:
if p.typeGroup in ("text", "table", "structure", "image"):
+ logger.debug(f"Chunking {p.typeGroup} part: {len(p.data)} chars")
chunks = chunkerRegistry.resolve(p.typeGroup).chunk(p, options)
+ logger.debug(f"Created {len(chunks)} chunks")
+
+ chunks_added = 0
for ch in chunks:
chSize = int(ch.get("size", 0) or 0)
- if current + chSize <= maxSize:
- kept.append(ContentPart(
- id=makeId(),
- parentId=p.id,
- label=f"chunk_{ch.get('order', 0)}",
- typeGroup=p.typeGroup,
- mimeType=p.mimeType,
- data=ch.get("data", ""),
- metadata={
- "size": chSize,
- "chunk": True,
- **ch.get("metadata", {})
- }
- ))
- current += chSize
- else:
- break
+ # Add all chunks - don't limit by maxSize since they'll be processed separately
+ kept.append(ContentPart(
+ id=makeId(),
+ parentId=p.id,
+ label=f"chunk_{ch.get('order', 0)}",
+ typeGroup=p.typeGroup,
+ mimeType=p.mimeType,
+ data=ch.get("data", ""),
+ metadata={
+ "size": chSize,
+ "chunk": True,
+ **ch.get("metadata", {})
+ }
+ ))
+ chunks_added += 1
+ logger.debug(f"Added chunk {ch.get('order', 0)}: {chSize} bytes")
+
+ logger.debug(f"Added {chunks_added} chunks from {p.typeGroup} part")
- # Apply merging strategy if provided
+ # Apply merging strategy if provided, but preserve chunks
if mergeStrategy:
- kept = _applyMerging(kept, mergeStrategy)
+ # Don't merge chunks - they should stay separate for processing
+ non_chunk_parts = [p for p in kept if not p.metadata.get("chunk", False)]
+ chunk_parts = [p for p in kept if p.metadata.get("chunk", False)]
+
+ logger.debug(f"Preserving {len(chunk_parts)} chunks from merging")
+
+ if non_chunk_parts:
+ non_chunk_parts = _applyMerging(non_chunk_parts, mergeStrategy)
+
+ # Combine non-chunk parts with chunk parts (chunks stay separate)
+ kept = non_chunk_parts + chunk_parts
+
+ logger.debug(f"Final parts after merging: {len(kept)} (chunks: {len(chunk_parts)})")
# Re-check size after merging
totalSize = sum(int(p.metadata.get("size", 0) or 0) for p in kept)
diff --git a/modules/services/serviceExtraction/utils/__init__.py b/modules/services/serviceExtraction/subUtils.py
similarity index 96%
rename from modules/services/serviceExtraction/utils/__init__.py
rename to modules/services/serviceExtraction/subUtils.py
index a16d3f59..efee532b 100644
--- a/modules/services/serviceExtraction/utils/__init__.py
+++ b/modules/services/serviceExtraction/subUtils.py
@@ -3,5 +3,3 @@ import uuid
def makeId() -> str:
    """Return a freshly generated random UUID4 in canonical string form."""
    identifier = uuid.uuid4()
    return str(identifier)
-
-
diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index f18f071b..14bfe7fe 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -257,4 +257,82 @@ class GenerationService:
'totalActions': 0,
'workflowStatus': 'unknown',
'workflowId': 'unknown'
- }
\ No newline at end of file
+ }
+
+ async def renderReport(self, extracted_content: str, output_format: str, title: str) -> tuple[str, str]:
+ """
+ Render extracted content to the specified output format.
+
+ Args:
+ extracted_content: Content extracted by AI using format-specific prompt
+ output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+ title: Report title
+
+ Returns:
+ tuple: (rendered_content, mime_type)
+ """
+ try:
+ # Get the appropriate renderer for the format
+ renderer = self._getFormatRenderer(output_format)
+ if not renderer:
+ raise ValueError(f"Unsupported output format: {output_format}")
+
+ # Render the content
+ rendered_content, mime_type = await renderer.render(extracted_content, title)
+
+ logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
+ return rendered_content, mime_type
+
+ except Exception as e:
+ logger.error(f"Error rendering report to {output_format}: {str(e)}")
+ raise
+
+ def getExtractionPrompt(self, output_format: str, user_prompt: str, title: str) -> str:
+ """
+ Get the format-specific extraction prompt for AI content extraction.
+
+ Args:
+ output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
+ user_prompt: User's original prompt for report generation
+ title: Report title
+
+ Returns:
+ str: Format-specific prompt for AI extraction
+ """
+ try:
+ # Get the appropriate renderer for the format
+ renderer = self._getFormatRenderer(output_format)
+ if not renderer:
+ raise ValueError(f"Unsupported output format: {output_format}")
+
+ # Get the format-specific extraction prompt
+ extraction_prompt = renderer.getExtractionPrompt(user_prompt, title)
+
+ logger.info(f"Generated {output_format}-specific extraction prompt: {len(extraction_prompt)} characters")
+ return extraction_prompt
+
+ except Exception as e:
+ logger.error(f"Error getting extraction prompt for {output_format}: {str(e)}")
+ raise
+
+ def _getFormatRenderer(self, output_format: str):
+ """Get the appropriate renderer for the specified format using auto-discovery."""
+ try:
+ from .renderers.registry import get_renderer
+ renderer = get_renderer(output_format)
+
+ if renderer:
+ return renderer
+
+ # Fallback to text renderer if no specific renderer found
+ logger.warning(f"No renderer found for format {output_format}, falling back to text")
+ fallback_renderer = get_renderer('text')
+ if fallback_renderer:
+ return fallback_renderer
+
+ logger.error("Even text renderer fallback failed")
+ return None
+
+ except Exception as e:
+ logger.error(f"Error getting renderer for {output_format}: {str(e)}")
+ return None
\ No newline at end of file
diff --git a/modules/services/serviceGeneration/renderers/base_renderer.py b/modules/services/serviceGeneration/renderers/base_renderer.py
new file mode 100644
index 00000000..dd91be09
--- /dev/null
+++ b/modules/services/serviceGeneration/renderers/base_renderer.py
@@ -0,0 +1,86 @@
+"""
+Base renderer class for all format renderers.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Tuple, List
+import logging
+
+logger = logging.getLogger(__name__)
+
+class BaseRenderer(ABC):
+ """Base class for all format renderers."""
+
+ def __init__(self):
+ self.logger = logger
+
+ @classmethod
+ def get_supported_formats(cls) -> List[str]:
+ """
+ Return list of supported format names for this renderer.
+ Override this method in subclasses to specify supported formats.
+ """
+ return []
+
+ @classmethod
+ def get_format_aliases(cls) -> List[str]:
+ """
+ Return list of format aliases for this renderer.
+ Override this method in subclasses to specify format aliases.
+ """
+ return []
+
+ @classmethod
+ def get_priority(cls) -> int:
+ """
+ Return priority for this renderer (higher number = higher priority).
+ Used when multiple renderers support the same format.
+ """
+ return 0
+
+ @abstractmethod
+ def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
+ """
+ Get the format-specific extraction prompt for AI content extraction.
+
+ Args:
+ user_prompt: User's original prompt for report generation
+ title: Report title
+
+ Returns:
+ str: Format-specific prompt for AI extraction
+ """
+ pass
+
+ @abstractmethod
+ async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
+ """
+ Render extracted content to the target format.
+
+ Args:
+ extracted_content: Raw content extracted by AI using format-specific prompt
+ title: Report title
+
+ Returns:
+ tuple: (rendered_content, mime_type)
+ """
+ pass
+
+ def _extract_sections(self, report_data: Dict[str, Any]) -> list:
+ """Extract sections from report data."""
+ return report_data.get('sections', [])
+
+ def _extract_metadata(self, report_data: Dict[str, Any]) -> Dict[str, Any]:
+ """Extract metadata from report data."""
+ return report_data.get('metadata', {})
+
+ def _get_title(self, report_data: Dict[str, Any], fallback_title: str) -> str:
+ """Get title from report data or use fallback."""
+ return report_data.get('title', fallback_title)
+
+ def _format_timestamp(self, timestamp: str = None) -> str:
+ """Format timestamp for display."""
+ if timestamp:
+ return timestamp
+ from datetime import datetime, UTC
+ return datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S UTC")
diff --git a/modules/services/serviceGeneration/renderers/csv_renderer.py b/modules/services/serviceGeneration/renderers/csv_renderer.py
new file mode 100644
index 00000000..8e2344ac
--- /dev/null
+++ b/modules/services/serviceGeneration/renderers/csv_renderer.py
@@ -0,0 +1,90 @@
+"""
+CSV renderer for report generation.
+"""
+
+from .base_renderer import BaseRenderer
+from typing import Dict, Any, Tuple, List
+import csv
+import io
+
class CsvRenderer(BaseRenderer):
    """Renders content to CSV format with format-specific extraction.

    The AI is asked to return raw CSV; rendering only strips any markdown
    code fences the model may have wrapped around it.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported CSV formats."""
        return ['csv']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['spreadsheet', 'table']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for CSV renderer."""
        return 70

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Get CSV-specific extraction prompt."""
        return f"""
{user_prompt}

Generate a comprehensive CSV report with the title: "{title}"

CSV FORMAT REQUIREMENTS:
- Create structured data in CSV format
- Use proper CSV syntax with commas and quotes
- Include headers for all columns
- Structure data in rows and columns
- Include source document information
- Add metadata as additional rows

CSV STRUCTURE:
- First row: Headers (Section, Type, Heading, Content, Source)
- Data rows: One per section/item
- Use quotes around content that contains commas
- Escape quotes properly
- Include metadata rows at the end

FORMATTING RULES:
- Headers: Section, Type, Heading, Content, Source
- Content: Escape commas and quotes, limit length
- Source: Include document name and page if available
- Metadata: Add special rows for generation info

OUTPUT POLICY:
- Return ONLY CSV data
- No markdown, no code blocks, no additional text
- Properly formatted CSV
- Include all necessary information
- Valid CSV that can be imported

Generate the complete CSV report:
"""

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to CSV format.

        Returns:
            tuple: (csv_text, "text/csv"); on failure a minimal one-row
            CSV describing the error (still "text/csv").
        """
        try:
            # The extracted content should already be CSV from the AI;
            # just strip any stray code fences.
            csv_content = self._clean_csv_content(extracted_content, title)
            return csv_content, "text/csv"

        except Exception as e:
            self.logger.error(f"Error rendering CSV: {str(e)}")
            # Return minimal CSV fallback
            return f"Title,Content\n{title},Error rendering report: {str(e)}", "text/csv"

    def _clean_csv_content(self, content: str, title: str) -> str:
        """Strip markdown code fences around AI-produced CSV.

        Handles leading ``` fences (with an optional language tag) and
        trailing ``` fences independently, so partially fenced output —
        which the old both-fences-only check left untouched — is cleaned
        as well.

        Args:
            content: Raw AI output, expected to already be CSV.
            title: Report title (unused; kept for interface symmetry).
        """
        content = content.strip()

        if content.startswith("```"):
            # Drop the entire fence line, including any language tag.
            first_newline = content.find('\n')
            content = content[first_newline + 1:] if first_newline != -1 else ""
        if content.endswith("```"):
            content = content[:-3]

        return content.strip()
diff --git a/modules/services/serviceGeneration/renderers/docx_renderer.py b/modules/services/serviceGeneration/renderers/docx_renderer.py
new file mode 100644
index 00000000..e2ea7f3f
--- /dev/null
+++ b/modules/services/serviceGeneration/renderers/docx_renderer.py
@@ -0,0 +1,291 @@
+"""
+DOCX renderer for report generation using python-docx.
+"""
+
+from .base_renderer import BaseRenderer
+from typing import Dict, Any, Tuple, List
+import io
+import base64
+from datetime import datetime, UTC
+
+try:
+ from docx import Document
+ from docx.shared import Inches, Pt
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
+ from docx.enum.table import WD_TABLE_ALIGNMENT
+ from docx.oxml.shared import OxmlElement, qn
+ from docx.oxml.ns import nsdecls
+ from docx.oxml import parse_xml
+ DOCX_AVAILABLE = True
+except ImportError:
+ DOCX_AVAILABLE = False
+
class DocxRenderer(BaseRenderer):
    """Renders content to DOCX format using python-docx.

    The AI is prompted for plain text with typographic conventions
    (ALL CAPS headings, "-" bullets, "1." numbered items, "|" tables);
    this renderer parses those conventions back into Word structure.
    The rendered document is returned base64-encoded; when python-docx
    is unavailable, rendering falls back to HTML output instead.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported DOCX formats."""
        return ['docx', 'doc']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['word', 'document']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for DOCX renderer."""
        return 115

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Get DOCX-specific extraction prompt."""
        return f"""
{user_prompt}

Generate a comprehensive DOCX report with the title: "{title}"

DOCX FORMAT REQUIREMENTS:
- Create structured content suitable for Word documents
- Use clear headings and sections with proper hierarchy
- Include tables for structured data
- Use bullet points and numbered lists where appropriate
- Include source document information
- Structure content for professional presentation
- Use consistent formatting throughout

DOCX STRUCTURE:
- Title page with report title and generation date
- Table of contents (if multiple sections)
- Executive summary
- Main content sections with clear headings
- Data tables and analysis
- Conclusions and recommendations
- Appendices with source information

FORMATTING RULES:
- Use clear section headings (H1, H2, H3 style)
- Include consistent paragraph formatting
- Use tables with proper alignment and borders
- Use bullet points and numbered lists
- Add source citations and references
- Include generation metadata
- Use professional fonts and spacing

OUTPUT POLICY:
- Return ONLY plain text content suitable for Word document generation
- NO markdown formatting (no **bold**, no # headings, no --- separators)
- NO HTML tags
- NO code blocks
- Use plain text with clear structure
- Use line breaks for separation
- Use indentation for lists
- Use ALL CAPS for major headings
- Use Title Case for subheadings
- Use bullet points with dashes (-) for lists
- Use numbers (1., 2., 3.) for numbered lists
- Professional document format
- Include all necessary information

Generate the complete DOCX report content:
"""

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to DOCX format.

        Returns:
            tuple: (base64-encoded DOCX bytes, DOCX mime type) on success;
            (HTML, "text/html") when python-docx is unavailable;
            (error text, "text/plain") on failure.
        """
        try:
            if not DOCX_AVAILABLE:
                # Fallback to HTML if python-docx not available
                from .html_renderer import HtmlRenderer
                html_renderer = HtmlRenderer()
                html_content, _ = await html_renderer.render(extracted_content, title)
                return html_content, "text/html"

            docx_content = self._generate_docx(extracted_content, title)
            return docx_content, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        except Exception as e:
            self.logger.error(f"Error rendering DOCX: {str(e)}")
            # Return minimal fallback
            return f"DOCX Generation Error: {str(e)}", "text/plain"

    @staticmethod
    def _numbered_item_text(line: str):
        """Return the text of a numbered list item ("12. foo" -> "foo"),
        or None if the line is not a numbered item.

        Replaces the old startswith(('1. ', ..., '5. ')) check, which
        missed items beyond 5 and mangled multi-digit numbers via line[3:].
        """
        head, sep, rest = line.partition('. ')
        if sep and head.isdigit() and rest.strip():
            return rest
        return None

    def _generate_docx(self, content: str, title: str) -> str:
        """Generate base64-encoded DOCX content using python-docx."""
        try:
            doc = Document()
            self._setup_document_styles(doc)

            # Title page: centered title plus generation timestamp.
            title_para = doc.add_heading(title, 0)
            title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            date_para = doc.add_paragraph(f"Generated: {self._format_timestamp()}")
            date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            doc.add_page_break()

            # Walk the content line by line, flushing accumulated body
            # lines whenever a heading opens a new section.
            lines = content.split('\n')
            current_section = []

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                is_list_marker = line.startswith('-') or line.startswith('*')

                # ALL CAPS -> level-1 heading.
                if line.isupper() and len(line) > 3 and not is_list_marker:
                    self._flush_section(doc, current_section)
                    current_section = []
                    doc.add_heading(line, level=1)
                # Title Case -> level-2 heading (numbered items excluded).
                elif (line.istitle() and len(line) > 5 and not is_list_marker
                      and self._numbered_item_text(line) is None):
                    self._flush_section(doc, current_section)
                    current_section = []
                    doc.add_heading(line, level=2)
                # Markdown headings as fallback.
                elif line.startswith('# '):
                    self._flush_section(doc, current_section)
                    current_section = []
                    doc.add_heading(line[2:], level=1)
                elif line.startswith('## '):
                    self._flush_section(doc, current_section)
                    current_section = []
                    doc.add_heading(line[3:], level=2)
                elif line.startswith('### '):
                    self._flush_section(doc, current_section)
                    current_section = []
                    doc.add_heading(line[4:], level=3)
                else:
                    current_section.append(line)

            # Flush any trailing body content.
            self._flush_section(doc, current_section)

            # Serialize and base64-encode for transport.
            buffer = io.BytesIO()
            doc.save(buffer)
            buffer.seek(0)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating DOCX: {str(e)}")
            raise

    def _flush_section(self, doc, lines: list):
        """Process accumulated section lines if any (no-op when empty)."""
        if lines:
            self._process_section(doc, lines)

    def _setup_document_styles(self, doc):
        """Set up document styles (Calibri body, bold scaled headings)."""
        try:
            # Set default font
            style = doc.styles['Normal']
            font = style.font
            font.name = 'Calibri'
            font.size = Pt(11)

            # Set heading styles
            for i in range(1, 4):
                heading_style = doc.styles[f'Heading {i}']
                heading_font = heading_style.font
                heading_font.name = 'Calibri'
                heading_font.size = Pt(16 - i * 2)
                heading_font.bold = True
        except Exception as e:
            self.logger.warning(f"Could not set up document styles: {str(e)}")

    def _process_section(self, doc, lines: list):
        """Process a section of content into DOCX elements.

        Fixes two defects of the previous version: a table no longer
        terminates processing of the section (remaining lines were
        silently dropped), and tables whose rows start with '|' —
        standard markdown style — are now recognized too.
        """
        idx = 0
        while idx < len(lines):
            line = lines[idx].strip()
            if not line:
                idx += 1
                continue

            # A contiguous run of pipe-delimited lines becomes a table.
            if '|' in line:
                run = []
                while idx < len(lines) and '|' in lines[idx]:
                    run.append(lines[idx])
                    idx += 1
                table_data = self._extract_table_data(run)
                if table_data:
                    self._add_table(doc, table_data)
                else:
                    # Not a real table (single row) — keep as paragraphs.
                    for raw in run:
                        doc.add_paragraph(raw.strip())
                continue

            if line.startswith('- ') or line.startswith('* '):
                doc.add_paragraph(line[2:], style='List Bullet')
            elif (item_text := self._numbered_item_text(line)) is not None:
                doc.add_paragraph(item_text, style='List Number')
            else:
                doc.add_paragraph(line)
            idx += 1

    def _extract_table_data(self, lines: list) -> list:
        """Extract table cell rows from pipe-delimited lines.

        Returns a list of rows (each a list of cell strings); an empty
        list when fewer than two rows were found (not a real table).
        """
        table_data = []
        in_table = False

        for line in lines:
            if '|' in line:
                in_table = True
                cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                if cells:
                    table_data.append(cells)
            elif in_table and not line.strip():
                # Blank line after table rows ends the table.
                break

        return table_data if len(table_data) > 1 else []

    def _add_table(self, doc, table_data: list):
        """Add a table built from ``table_data`` rows to the document."""
        try:
            if not table_data:
                return

            # Create table sized by the header row.
            table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            # Fill cells; ragged rows are clipped to the table width.
            for row_idx, row_data in enumerate(table_data):
                for col_idx, cell_data in enumerate(row_data):
                    if col_idx < len(table.rows[row_idx].cells):
                        table.rows[row_idx].cells[col_idx].text = cell_data

            self._style_table(table)

        except Exception as e:
            self.logger.warning(f"Could not add table: {str(e)}")

    def _style_table(self, table):
        """Apply styling to the table (bold header row)."""
        try:
            if len(table.rows) > 0:
                header_cells = table.rows[0].cells
                for cell in header_cells:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True
        except Exception as e:
            self.logger.warning(f"Could not style table: {str(e)}")
\ No newline at end of file
diff --git a/modules/services/serviceGeneration/renderers/excel_renderer.py b/modules/services/serviceGeneration/renderers/excel_renderer.py
new file mode 100644
index 00000000..e22a5b5f
--- /dev/null
+++ b/modules/services/serviceGeneration/renderers/excel_renderer.py
@@ -0,0 +1,264 @@
+"""
+Excel renderer for report generation using openpyxl.
+"""
+
+from .base_renderer import BaseRenderer
+from typing import Dict, Any, Tuple, List
+import io
+import base64
+from datetime import datetime, UTC
+
+try:
+ from openpyxl import Workbook
+ from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
+ from openpyxl.utils import get_column_letter
+ from openpyxl.worksheet.table import Table, TableStyleInfo
+ OPENPYXL_AVAILABLE = True
+except ImportError:
+ OPENPYXL_AVAILABLE = False
+
class ExcelRenderer(BaseRenderer):
    """Renders content to Excel format using openpyxl.

    Builds a three-sheet workbook (Summary, Data, Analysis) from the AI's
    text output and returns it base64-encoded; when openpyxl is missing,
    rendering falls back to CSV output instead.
    """

    @classmethod
    def get_supported_formats(cls) -> List[str]:
        """Return supported Excel formats."""
        return ['xlsx', 'xls', 'excel']

    @classmethod
    def get_format_aliases(cls) -> List[str]:
        """Return format aliases."""
        return ['spreadsheet', 'workbook']

    @classmethod
    def get_priority(cls) -> int:
        """Return priority for Excel renderer."""
        return 110

    def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
        """Get Excel-specific extraction prompt."""
        return f"""
{user_prompt}

Generate a comprehensive Excel report with the title: "{title}"

EXCEL FORMAT REQUIREMENTS:
- Create structured data suitable for Excel spreadsheets
- Use clear column headers and organized rows
- Include multiple sheets if needed (Summary, Data, Analysis, etc.)
- Use proper data types (text, numbers, dates)
- Include formulas where appropriate
- Structure data in tables with clear headers
- Include source document information
- Add metadata and generation information

EXCEL STRUCTURE:
- Sheet 1: Summary/Overview with key metrics
- Sheet 2: Detailed data in tabular format
- Sheet 3: Analysis and insights
- Use proper column headers (A, B, C, etc.)
- Include data validation and formatting hints
- Add comments for complex data

FORMATTING RULES:
- Headers: Use bold formatting, clear column names
- Data: Organize in rows and columns, consistent formatting
- Numbers: Use proper number formatting (currency, percentages, etc.)
- Dates: Use standard date format (YYYY-MM-DD)
- Text: Left-aligned, wrap long text
- Formulas: Use Excel formula syntax (=SUM, =AVERAGE, etc.)
- Colors: Use conditional formatting for highlights

SHEET STRUCTURE:
Sheet 1 - Summary:
- Report Title
- Key Metrics (counts, totals, averages)
- Executive Summary
- Generation Date

Sheet 2 - Data:
- Column A: Item/Category
- Column B: Value/Amount
- Column C: Percentage
- Column D: Source Document
- Column E: Notes/Comments

Sheet 3 - Analysis:
- Trends and patterns
- Comparisons
- Recommendations
- Charts descriptions

OUTPUT POLICY:
- Return ONLY Excel-compatible data
- No HTML, no markdown, no code blocks
- Structured data that can be imported to Excel
- Include sheet names and structure
- Professional spreadsheet format
- Include all necessary information

Generate the complete Excel report data:
"""

    async def render(self, extracted_content: str, title: str) -> Tuple[str, str]:
        """Render extracted content to Excel format.

        Returns:
            tuple: (base64-encoded XLSX bytes, XLSX mime type) on success;
            (CSV text, "text/csv") when openpyxl is unavailable or on error.
        """
        try:
            if not OPENPYXL_AVAILABLE:
                # Fallback to CSV if openpyxl not available
                from .csv_renderer import CsvRenderer
                csv_renderer = CsvRenderer()
                csv_content, _ = await csv_renderer.render(extracted_content, title)
                return csv_content, "text/csv"

            excel_content = self._generate_excel(extracted_content, title)
            return excel_content, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

        except Exception as e:
            self.logger.error(f"Error rendering Excel: {str(e)}")
            # Return CSV fallback
            return f"Title,Content\n{title},Error rendering Excel report: {str(e)}", "text/csv"

    def _generate_excel(self, content: str, title: str) -> str:
        """Generate base64-encoded Excel workbook using openpyxl."""
        try:
            wb = Workbook()
            # Drop the auto-created default sheet so ours are the only ones.
            wb.remove(wb.active)

            summary_sheet = wb.create_sheet("Summary", 0)
            data_sheet = wb.create_sheet("Data", 1)
            analysis_sheet = wb.create_sheet("Analysis", 2)

            self._populate_summary_sheet(summary_sheet, title)
            self._populate_data_sheet(data_sheet, content)
            self._populate_analysis_sheet(analysis_sheet, content)

            # Serialize and base64-encode for transport.
            buffer = io.BytesIO()
            wb.save(buffer)
            buffer.seek(0)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

        except Exception as e:
            self.logger.error(f"Error generating Excel: {str(e)}")
            raise

    def _populate_summary_sheet(self, sheet, title: str):
        """Populate the summary sheet (title, timestamps, key metrics)."""
        try:
            # Title
            sheet['A1'] = title
            sheet['A1'].font = Font(size=16, bold=True)
            sheet['A1'].alignment = Alignment(horizontal='center')

            # Generation info
            sheet['A3'] = "Generated:"
            sheet['B3'] = self._format_timestamp()
            sheet['A4'] = "Status:"
            sheet['B4'] = "Generated Successfully"

            # Key metrics placeholder
            sheet['A6'] = "Key Metrics:"
            sheet['A6'].font = Font(bold=True)
            sheet['A7'] = "Total Items:"
            sheet['B7'] = "=COUNTA(Data!A:A)-1"  # Count non-empty cells in Data sheet

            # Fixed column widths for readability.
            sheet.column_dimensions['A'].width = 20
            sheet.column_dimensions['B'].width = 30

        except Exception as e:
            self.logger.warning(f"Could not populate summary sheet: {str(e)}")

    def _populate_data_sheet(self, sheet, content: str):
        """Populate the data sheet from pipe-delimited and plain lines."""
        try:
            # Headers
            headers = ["Item/Category", "Value/Amount", "Percentage", "Source Document", "Notes/Comments"]
            for col, header in enumerate(headers, 1):
                cell = sheet.cell(row=1, column=col, value=header)
                cell.font = Font(bold=True)
                cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")

            row = 2
            for line in content.split('\n'):
                line = line.strip()
                if not line:
                    continue

                if '|' in line:
                    cells = [cell.strip() for cell in line.split('|') if cell.strip()]
                    # Skip markdown table separator rows like |---|---| so
                    # they don't appear as data (fix over previous version).
                    if cells and all(set(c) <= set('-: ') for c in cells):
                        continue
                    for col, cell_data in enumerate(cells[:5], 1):  # Limit to 5 columns
                        sheet.cell(row=row, column=col, value=cell_data)
                    row += 1
                else:
                    # Regular content goes into the first column.
                    sheet.cell(row=row, column=1, value=line)
                    row += 1

            # Fixed column widths for readability.
            for col in range(1, 6):
                sheet.column_dimensions[get_column_letter(col)].width = 20

        except Exception as e:
            self.logger.warning(f"Could not populate data sheet: {str(e)}")

    def _populate_analysis_sheet(self, sheet, content: str):
        """Populate the analysis sheet with simple content statistics."""
        try:
            # Title
            sheet['A1'] = "Analysis & Insights"
            sheet['A1'].font = Font(size=14, bold=True)

            sheet['A3'] = "Content Analysis:"
            sheet['A3'].font = Font(bold=True)
            row = 4

            # Count only non-blank, stripped lines so blank lines no
            # longer inflate the "Text Lines" figure (fix over previous
            # version, which counted raw lines).
            content_lines = [ln.strip() for ln in content.split('\n') if ln.strip()]
            table_lines = sum(1 for ln in content_lines if '|' in ln)
            list_lines = sum(1 for ln in content_lines if ln.startswith(('- ', '* ')))
            text_lines = len(content_lines) - table_lines - list_lines

            sheet[f'A{row}'] = f"Total Lines: {len(content_lines)}"
            row += 1
            sheet[f'A{row}'] = f"Table Rows: {table_lines}"
            row += 1
            sheet[f'A{row}'] = f"List Items: {list_lines}"
            row += 1
            sheet[f'A{row}'] = f"Text Lines: {text_lines}"
            row += 2

            # Recommendations
            sheet[f'A{row}'] = "Recommendations:"
            sheet[f'A{row}'].font = Font(bold=True)
            row += 1
            sheet[f'A{row}'] = "1. Review data accuracy"
            row += 1
            sheet[f'A{row}'] = "2. Consider additional analysis"
            row += 1
            sheet[f'A{row}'] = "3. Update regularly"

            # Fixed column width for readability.
            sheet.column_dimensions['A'].width = 30

        except Exception as e:
            self.logger.warning(f"Could not populate analysis sheet: {str(e)}")
diff --git a/modules/services/serviceGeneration/renderers/html_renderer.py b/modules/services/serviceGeneration/renderers/html_renderer.py
new file mode 100644
index 00000000..5d3c886a
--- /dev/null
+++ b/modules/services/serviceGeneration/renderers/html_renderer.py
@@ -0,0 +1,94 @@
+"""
+HTML renderer for report generation.
+"""
+
+from .base_renderer import BaseRenderer
+from typing import Dict, Any, Tuple, List
+
+class HtmlRenderer(BaseRenderer):
+ """Renders content to HTML format with format-specific extraction."""
+
+ @classmethod
+ def get_supported_formats(cls) -> List[str]:
+ """Return supported HTML formats."""
+ return ['html', 'htm']
+
+ @classmethod
+ def get_format_aliases(cls) -> List[str]:
+ """Return format aliases."""
+ return ['web', 'webpage']
+
+ @classmethod
+ def get_priority(cls) -> int:
+ """Return priority for HTML renderer."""
+ return 100
+
+ def getExtractionPrompt(self, user_prompt: str, title: str) -> str:
+ """Get HTML-specific extraction prompt."""
+ return f"""
+{user_prompt}
+
+Generate a comprehensive HTML report with the title: "{title}"
+
+HTML STRUCTURE REQUIREMENTS:
+- Create a complete, self-contained HTML document
+- Start with:
+- Include: , (with and ), and
+- Use proper HTML5 semantic elements: , , , ,