# gateway/analyze_naming_violations.py

"""
Script to analyze codebase for snake_case naming violations that should be camelStyle.
Excludes routes (decorated endpoint functions) and JSON field names.
"""
import ast
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
import csv

# Patterns to exclude (external library interfaces, etc.)
# Each entry is a regular expression. shouldExcludeName() applies them with
# re.search(..., re.IGNORECASE) to its `context` argument and excludes the
# name as soon as any pattern matches.
EXCLUDE_PATTERNS = [
    r'@.*\.(get|post|put|delete|patch|options|head)',  # FastAPI route decorators
    r'self\.(db|db_|model|orm)',  # Database ORM attributes
    r'\.(objects|query|filter|get|all)',  # ORM methods
    r'(request|response|response_model|status_code)',  # FastAPI params
    r'(snake_case|kebab-case)',  # String literals
]

# External library attribute patterns (should not be changed)
# NOTE(review): nothing in the visible part of this file reads this set —
# confirm whether it is still needed before relying on it.
EXTERNAL_LIB_ATTRIBUTES = {
    'pydantic', 'fastapi', 'sqlalchemy', 'psycopg', 'requests',
    'aiohttp', 'azure', 'google', 'openai', 'anthropic', 'reportlab',
    'docx', 'pptx', 'openpyxl', 'json', 'logging', 'datetime', 'typing'
}

def isRouteFile(filePath: str) -> bool:
    """Tell whether a path looks like it belongs to a route module.

    A path qualifies when it contains 'routes' anywhere (case-sensitive), or
    when its final path component contains 'route' (case-insensitive).
    """
    if 'routes' in filePath:
        return True
    fileName = os.path.basename(filePath)
    return 'route' in fileName.lower()

def shouldExcludeName(name: str, context: str = "") -> bool:
    """Decide whether a name is exempt from the naming analysis

    Dunder names (``__x__``) are always exempt. Otherwise the name is exempt
    when ``context`` matches any regex in EXCLUDE_PATTERNS (case-insensitive).
    """
    isDunder = name.startswith('__') and name.endswith('__')
    if isDunder:
        return True

    # The surrounding source text decides whether this is external-library usage
    return any(
        re.search(excludeRegex, context, re.IGNORECASE)
        for excludeRegex in EXCLUDE_PATTERNS
    )

def isSnakeCase(name: str) -> bool:
    """Report whether a name counts as snake_case for this analysis

    Empty names and names with a leading underscore never count. Otherwise a
    name counts when it contains an underscore and is not written entirely in
    upper case (ALL_CAPS constants are left alone).
    """
    if not name:
        return False
    if name.startswith('_'):
        return False
    hasUnderscore = '_' in name
    return hasUnderscore and not name.isupper()

def analyzeFile(filePath: str) -> Dict[str, List[str]]:
    """Analyze a Python file for snake_case naming violations.

    The file is parsed with ``ast`` and three kinds of names are checked:
    function names, parameter names, and variables bound by assignments,
    ``for`` loops and ``with`` statements (sync and async alike).

    Functions decorated as HTTP route endpoints (``@x.get``, ``@x.post(...)``
    and the other HTTP verbs) keep their own name unflagged, but their
    parameters and bodies are still analyzed.

    Args:
        filePath: Path of the Python source file to analyze.

    Returns:
        Dict with the keys 'functions', 'parameters' and 'variables', each
        holding a list of entries such as ``"my_var (line 12)"``. All lists
        are empty when the file cannot be read or parsed.
    """
    violations: Dict[str, List[str]] = {
        'functions': [],
        'parameters': [],
        'variables': []
    }

    try:
        with open(filePath, 'r', encoding='utf-8') as f:
            content = f.read()
        tree = ast.parse(content, filename=filePath)
    except (OSError, SyntaxError, ValueError):
        # Best effort: one unreadable or unparsable file must not abort a whole
        # scan. UnicodeDecodeError is a ValueError subclass, and some Python
        # versions raise ValueError for source containing null bytes.
        return violations

    # Decorator attribute names that mark a function as a route endpoint
    routeMethods = frozenset(
        ('get', 'post', 'put', 'delete', 'patch', 'options', 'head')
    )

    class NamingAnalyzer(ast.NodeVisitor):
        """Collects violations into the enclosing ``violations`` dict"""

        def __init__(self):
            self.violations = violations

        @staticmethod
        def _isViolation(name):
            """True when the name is snake_case and not excluded"""
            return isSnakeCase(name) and not shouldExcludeName(name)

        @staticmethod
        def _isRouteEndpoint(node):
            """True when any decorator looks like ``@x.get`` / ``@x.get(...)``"""
            for decorator in node.decorator_list:
                # Unwrap the call form so both spellings are handled alike
                attrNode = decorator.func if isinstance(decorator, ast.Call) else decorator
                if isinstance(attrNode, ast.Attribute) and attrNode.attr in routeMethods:
                    return True
            return False

        def _recordTarget(self, target, lineno):
            """Record every plain name bound by an assignment-like target"""
            if isinstance(target, ast.Name):
                if self._isViolation(target.id):
                    self.violations['variables'].append(f"{target.id} (line {lineno})")
            elif isinstance(target, (ast.Tuple, ast.List)):
                # Unpacking such as ``first_name, last_name = ...``
                for element in target.elts:
                    self._recordTarget(element, lineno)
            elif isinstance(target, ast.Starred):
                self._recordTarget(target.value, lineno)
            # Attribute/subscript targets do not introduce a new name

        def visit_FunctionDef(self, node):
            # Route endpoint names are skipped, but their parameters and
            # variables are still checked
            funcName = node.name
            if not self._isRouteEndpoint(node) and self._isViolation(funcName):
                self.violations['functions'].append(f"{funcName} (line {node.lineno})")

            # Gather every kind of parameter, not just plain positional ones
            arguments = node.args
            params = list(getattr(arguments, 'posonlyargs', []))
            params.extend(arguments.args)
            if arguments.vararg is not None:
                params.append(arguments.vararg)
            params.extend(arguments.kwonlyargs)
            if arguments.kwarg is not None:
                params.append(arguments.kwarg)

            for param in params:
                paramName = param.arg
                if paramName in ('self', 'cls'):
                    continue
                if self._isViolation(paramName):
                    self.violations['parameters'].append(
                        f"{paramName} in {funcName} (line {node.lineno})"
                    )

            # Analyze function body for local variables
            for stmt in node.body:
                self.visit(stmt)

        # ``async def`` has the same fields as ``def``
        visit_AsyncFunctionDef = visit_FunctionDef

        def visit_Assign(self, node):
            for target in node.targets:
                self._recordTarget(target, node.lineno)

        def visit_For(self, node):
            self._recordTarget(node.target, node.lineno)
            self.generic_visit(node)

        visit_AsyncFor = visit_For

        def visit_With(self, node):
            for item in node.items:
                if item.optional_vars is not None:
                    self._recordTarget(item.optional_vars, node.lineno)
            self.generic_visit(node)

        visit_AsyncWith = visit_With

    analyzer = NamingAnalyzer()
    analyzer.visit(tree)

    return violations

def analyzeCodebase(rootDir: str = 'gateway') -> Dict[str, Dict[str, int]]:
    """Run the naming analysis over every Python file below a directory

    Returns a mapping keyed by the file's path relative to the scanned root
    (with forward slashes). Each value holds the violation counts under
    'functions', 'parameters' and 'variables', plus the raw per-file lists
    under 'details'. Only files with at least one violation get an entry.
    """
    categories = ('functions', 'parameters', 'variables')

    def emptyEntry():
        return {
            'functions': 0,
            'parameters': 0,
            'variables': 0,
            'details': {
                'functions': [],
                'parameters': [],
                'variables': []
            }
        }

    summary = defaultdict(emptyEntry)

    # Resolve the scan root: as given, then relative to the current directory,
    # and finally the current directory itself (we may already be inside it)
    basePath = Path(rootDir)
    if not basePath.exists():
        basePath = Path('.').resolve() / rootDir
    if not basePath.exists():
        basePath = Path('.')

    for sourceFile in basePath.rglob('*.py'):
        relativeName = str(sourceFile.relative_to(basePath))

        # Test files, tool scripts and bytecode caches are left out
        isSkipped = (
            'test' in relativeName.lower()
            or 'tool_' in relativeName
            or '__pycache__' in relativeName
        )
        if isSkipped:
            continue

        found = analyzeFile(str(sourceFile))
        counts = {category: len(found[category]) for category in categories}
        if sum(counts.values()) <= 0:
            continue

        entry = summary[relativeName.replace('\\', '/')]
        entry.update(counts)
        entry['details'] = found

    return summary

def generateCSV(results: Dict[str, Dict[str, int]], outputFile: str = 'naming_violations.csv'):
    """Write a per-module violation report as CSV and print a summary.

    One row is written per module whose combined count is above zero, ordered
    by that combined count from highest to lowest, after a header row.

    Args:
        results: Mapping of module name to a dict holding integer counts
            under the keys 'functions', 'parameters' and 'variables'.
        outputFile: Path of the CSV file to create (overwritten if present).
    """
    def moduleTotal(stats):
        """Combined violation count of one module entry"""
        return stats['functions'] + stats['parameters'] + stats['variables']

    # Sort by total violations
    sortedResults = sorted(
        results.items(),
        key=lambda item: moduleTotal(item[1]),
        reverse=True
    )

    rowsWritten = 0
    with open(outputFile, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Module', 'Function Names', 'Parameter Names', 'Variable Names', 'Total'])

        for module, stats in sortedResults:
            total = moduleTotal(stats)
            if total > 0:
                writer.writerow([
                    module,
                    stats['functions'],
                    stats['parameters'],
                    stats['variables'],
                    total
                ])
                rowsWritten += 1

    # Only warn when modules were reported; an empty result set simply means
    # there was nothing to write, not that something went wrong
    if results and rowsWritten == 0:
        print("WARNING: No rows written to CSV despite finding violations!")

    print(f"CSV report generated: {outputFile}")
    print(f"Total modules analyzed: {len(results)}")

    # Print summary
    totalFuncs = sum(r['functions'] for r in results.values())
    totalParams = sum(r['parameters'] for r in results.values())
    totalVars = sum(r['variables'] for r in results.values())
    print("\nSummary:")
    print(f"  Function names: {totalFuncs}")
    print(f"  Parameter names: {totalParams}")
    print(f"  Variable names: {totalVars}")
    print(f"  Total violations: {totalFuncs + totalParams + totalVars}")

if __name__ == '__main__':
    print("Analyzing codebase for naming violations...")
    results = analyzeCodebase('gateway')

    # Write CSV to gateway directory. analyzeCodebase() falls back to the
    # current directory when 'gateway' does not exist (e.g. when the script is
    # run from inside it), so do the same here instead of failing on open().
    outputDir = Path('gateway')
    if not outputDir.is_dir():
        outputDir = Path('.')
    outputPath = outputDir / 'naming_violations_report.csv'
    generateCSV(results, str(outputPath))