chore: add multi table support

This commit is contained in:
Christopher Gondek 2025-10-13 14:26:31 +02:00
parent 97c503e727
commit 70c5f3a6b0
5 changed files with 132 additions and 69 deletions

View file

@ -142,22 +142,31 @@ REGISTRY: Dict[
@dataclass
class Preprocessor:
class TableConfig:
    """Settings describing one table handled by the preprocessing pipeline.

    Attributes:
        name: Identifier of the table. The service passes it to the data
            saver as the target table name and uses it in error messages.
        powerbi_table_name: Table name handed to the Power BI reader when
            the data for this table is fetched.
        steps: Preprocessing steps applied in order. Each entry is a mapping
            whose first key names the step and whose value holds that
            step's parameters. Defaults to a fresh empty list per instance.
    """

    name: str
    powerbi_table_name: str
    steps: List[Dict[str, Any]] = field(default_factory=list)
@dataclass
class Preprocessor:
table_configs: List[TableConfig] = field(default_factory=list)
last_report: List[str] = field(default_factory=list)
@classmethod
async def create(cls, config_path: str) -> "Preprocessor":
"""Create a Preprocessor instance from a YAML configuration file.
Loads preprocessing steps from a YAML configuration file and creates
a new Preprocessor instance with those steps.
Loads table configurations from a YAML configuration file and creates
a new Preprocessor instance with those configurations.
Args:
config_path: Path to the YAML configuration file containing preprocessing steps.
config_path: Path to the YAML configuration file containing table configurations.
Returns:
A new Preprocessor instance configured with the steps from the file.
A new Preprocessor instance configured with tables from the file.
Raises:
FileNotFoundError: If the configuration file does not exist.
@ -168,10 +177,37 @@ class Preprocessor:
"""
path = Path(config_path)
cfg = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
return cls(steps=cfg.get("steps", []))
async def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
"""Apply all configured preprocessing steps to a DataFrame.
# Parse table configurations
table_configs = []
for table_data in cfg.get("tables", []):
table_config = TableConfig(
name=table_data.get("name", ""),
powerbi_table_name=table_data.get("powerbi_table_name", ""),
steps=table_data.get("steps", []),
)
table_configs.append(table_config)
return cls(table_configs=table_configs)
def get_table_configs(self) -> List[TableConfig]:
    """Return the table configurations held by this preprocessor.

    The stored list itself is returned, not a copy, so changes made by the
    caller are visible on the instance.

    Returns:
        The ``TableConfig`` entries stored in ``self.table_configs``.

    Example:
        >>> preprocessor = await Preprocessor.create("config.yaml")
        >>> for config in preprocessor.get_table_configs():
        ...     print(config.name, config.powerbi_table_name)
    """
    return self.table_configs
async def preprocess(
self, df: pd.DataFrame, *, steps: List[Dict[str, Any]]
) -> pd.DataFrame:
"""Apply preprocessing steps to a DataFrame.
Executes each preprocessing step in sequence on the provided DataFrame.
Each step is looked up in the REGISTRY and applied with its parameters.
@ -180,18 +216,20 @@ class Preprocessor:
Args:
df: The input DataFrame to preprocess.
steps: List of preprocessing steps to apply.
Returns:
The preprocessed DataFrame after applying all configured steps.
Example:
>>> preprocessor = await Preprocessor.create("config.yaml")
>>> table_config = preprocessor.get_table_configs()[0]
>>> df = pd.DataFrame({'A': [1, 2, None], 'B': ['x', 'y', 'z']})
>>> result = await preprocessor.preprocess(df)
>>> result = await preprocessor.preprocess(df, steps=table_config.steps)
>>> print(preprocessor.last_report) # Check for any warnings
"""
report: List[str] = []
for step in self.steps:
for step in steps:
name, params = next(iter(step.items()))
fn = REGISTRY.get(name)
if not fn:

View file

@ -12,33 +12,28 @@ from src.dataprocessor.domain.sqlite_datasaver import SQLiteDataSaver
class DataProcessorService:
"""Service class for data processing operations."""
power_bi_reader: PowerBIReader = None
preprocessor: Preprocessor = None
data_saver: BaseDataSaver = None
access_token: str = None
@classmethod
async def create(cls) -> "DataProcessorService":
"""Create a new instance of DataProcessorService."""
instance = cls()
instance.power_bi_reader = await cls._create_powerbi_reader()
instance.access_token = await cls._get_access_token()
instance.preprocessor = await cls._create_preprocessor()
instance.data_saver = await SQLiteDataSaver.create(settings.DB_PATH)
return instance
@staticmethod
async def _create_powerbi_reader() -> PowerBIReader:
"""Create and initialize the PowerBIReader."""
async def _get_access_token() -> str:
"""Get Power BI access token."""
access_token = await PowerBIReader._get_access_token_async(
tenant_id=settings.POWERBI_TENANT_ID,
client_id=settings.POWERBI_CLIENT_ID,
client_secret=settings.POWERBI_CLIENT_SECRET,
)
power_bi_reader = await PowerBIReader.create(
dataset_id=settings.POWERBI_DATASET_ID,
access_token=access_token,
table_name=settings.POWERBI_TABLE_NAME,
)
return power_bi_reader
return access_token
@staticmethod
async def _create_preprocessor() -> Preprocessor:
@ -47,16 +42,33 @@ class DataProcessorService:
return preprocessor
async def update_database(self):
"""Placeholder method for updating the database."""
# Step 1: Read data from Power BI
df = await self.power_bi_reader.read_data()
if df.empty:
raise RuntimeError("No data read from Power BI.")
# Step 2: Preprocess the data
df = await self.preprocessor.preprocess(df)
if df.empty:
raise RuntimeError("No data returned from preprocessing.")
# Step 3: Update the local SQLite database
await self.data_saver.save_table(
df, self.power_bi_reader.table_name, overwrite=True
)
"""Update the database by processing all configured tables."""
table_configs = self.preprocessor.get_table_configs()
if not table_configs:
raise RuntimeError("No table configurations found in preprocessing config.")
for table_config in table_configs:
# Step 1: Create PowerBIReader for this table
power_bi_reader = await PowerBIReader.create(
dataset_id=settings.POWERBI_DATASET_ID,
access_token=self.access_token,
table_name=table_config.powerbi_table_name,
)
# Step 2: Read data from Power BI
df = await power_bi_reader.read_data()
if df.empty:
raise RuntimeError(
f"No data read from Power BI for table '{table_config.name}'."
)
# Step 3: Preprocess the data using this table's steps
df = await self.preprocessor.preprocess(df, steps=table_config.steps)
if df.empty:
raise RuntimeError(
f"No data returned from preprocessing for table '{table_config.name}'."
)
# Step 4: Update the local SQLite database
await self.data_saver.save_table(df, table_config.name, overwrite=True)

View file

@ -1,29 +1,32 @@
version: 1
steps:
- keep:
columns:
[
"Artikelkürzel",
"Artikelnummer",
"Artikelbezeichnung",
"Lieferant",
"Ist-Bestand",
"Einheit",
"EP in CHF",
]
- fillna:
column: "Lieferant"
value: "Unbekannt"
- to_numeric:
column: "EP in CHF"
errors: "coerce"
- dropna:
subset:
[
"Artikelkürzel",
"Artikelnummer",
"Artikelbezeichnung",
"Ist-Bestand",
"Einheit",
"EP in CHF",
]
# One entry per table to process; Preprocessor.create loads each entry
# into a TableConfig (missing keys fall back to "" / an empty step list).
tables:
# name: passed to the data saver as the target table name.
- name: "inventory_table"
# powerbi_table_name: table name handed to the Power BI reader.
powerbi_table_name: "InventoryData"
# steps: applied in order; each step's key is looked up in REGISTRY and
# its value is passed along as that step's parameters.
steps:
- keep:
columns:
[
"Artikelkürzel",
"Artikelnummer",
"Artikelbezeichnung",
"Lieferant",
"Ist-Bestand",
"Einheit",
"EP in CHF",
]
- fillna:
column: "Lieferant"
value: "Unbekannt"
- to_numeric:
column: "EP in CHF"
errors: "coerce"
- dropna:
subset:
[
"Artikelkürzel",
"Artikelnummer",
"Artikelbezeichnung",
"Ist-Bestand",
"Einheit",
"EP in CHF",
]

View file

@ -59,11 +59,6 @@ class Settings(BaseSettings):
..., description="Power BI Dataset ID to read data from."
)
# Power BI Table Name.
POWERBI_TABLE_NAME: str = Field(
..., description="Power BI Table name to read data from."
)
# Power BI Tenant ID.
POWERBI_TENANT_ID: str = Field(
..., description="Azure AD Tenant ID for Power BI authentication."

View file

@ -3,14 +3,29 @@
import pytest
from src.dataprocessor.domain.powerbi_reader import PowerBIReader
from src.dataprocessor.domain.preprocessor import Preprocessor
from src.settings import settings
@pytest.mark.asyncio
async def test_read_data_prints_dataframe_info() -> None:
"""Test PowerBIReader.read_data() and print DataFrame info for development."""
# Load preprocessor config to get table configurations
print("\nLoading table configurations...")
preprocessor = await Preprocessor.create(settings.PP_CONFIG_PATH)
table_configs = preprocessor.get_table_configs()
if not table_configs:
pytest.skip("No table configurations found in preprocessing config.")
# Use the first table configuration for testing
table_config = table_configs[0]
print(
f"✓ Using table configuration: {table_config.name} -> {table_config.powerbi_table_name}"
)
# Get access token
print("\nGetting access token...")
print("Getting access token...")
access_token = await PowerBIReader._get_access_token_async(
tenant_id=settings.POWERBI_TENANT_ID,
client_id=settings.POWERBI_CLIENT_ID,
@ -23,9 +38,9 @@ async def test_read_data_prints_dataframe_info() -> None:
reader = await PowerBIReader.create(
dataset_id=settings.POWERBI_DATASET_ID,
access_token=access_token,
table_name=settings.POWERBI_TABLE_NAME,
table_name=table_config.powerbi_table_name,
)
print(f"✓ Reader created for table: {settings.POWERBI_TABLE_NAME}")
print(f"✓ Reader created for table: {table_config.powerbi_table_name}")
# Call read_data() once
print("Fetching data from Power BI...")