from __future__ import annotations
import json
import logging
import os
import re
import tomllib
from pathlib import Path
from typing import Any, ClassVar
import yaml
from chunker.utils.json import load_json_file
# tomli_w is needed for writing TOML files (tomllib is read-only)
try:
import tomli_w
HAS_TOMLI_W = True
except ImportError:
HAS_TOMLI_W = False
from .languages.base import PluginConfig
logger = logging.getLogger(__name__)
[docs]
class ChunkerConfig:
"""Configuration manager for the chunker system.
Supports environment variable expansion and overrides:
- ${VAR} or ${VAR:default} syntax in config files
- CHUNKER_* environment variables override config values
"""
DEFAULT_CONFIG_FILENAME = "chunker.config"
SUPPORTED_FORMATS: ClassVar[set[str]] = {".toml", ".yaml", ".yml", ".json"}
ENV_PREFIX = "CHUNKER_"
# Match ${VAR} or ${VAR:default}
ENV_VAR_PATTERN = re.compile(r"\$\{([^}]+)\}")
[docs]
def __init__(
self,
config_path: Path | None = None,
use_env_vars: bool = True,
):
self.config_path = config_path
self.data: dict[str, Any] = {}
self.plugin_configs: dict[str, PluginConfig] = {}
self.use_env_vars = use_env_vars
self.plugin_dirs: list[Path] = []
self.enabled_languages: set[str] | None = None
self.default_plugin_config: PluginConfig = PluginConfig()
if config_path:
self.load(config_path)
[docs]
@classmethod
def find_config(cls, start_path: Path | None = None) -> Path | None:
"""Find configuration file starting from the given path."""
if start_path is None:
start_path = Path.cwd()
current = start_path.resolve()
while current != current.parent:
for ext in cls.SUPPORTED_FORMATS:
config_file = current / f"{cls.DEFAULT_CONFIG_FILENAME}{ext}"
if config_file.exists():
return config_file
current = current.parent
home = Path.home()
for ext in cls.SUPPORTED_FORMATS:
config_file = home / ".chunker" / f"config{ext}"
if config_file.exists():
return config_file
return None
[docs]
def load(self, config_path: Path) -> None:
"""Load configuration from file."""
config_path = Path(config_path)
if not config_path.exists():
raise FileNotFoundError(f"Configuration file not found: {config_path}")
ext = config_path.suffix.lower()
try:
if ext == ".toml":
# tomllib requires binary mode
with Path(config_path).open("rb") as f:
self.data = tomllib.load(f)
elif ext in {".yaml", ".yml"}:
with Path(config_path).open(encoding="utf-8") as f:
self.data = yaml.safe_load(f) or {}
elif ext == ".json":
self.data = load_json_file(config_path)
else:
raise ValueError(f"Unsupported config format: {ext}")
self.config_path = config_path
if self.use_env_vars:
self.data = self._expand_env_vars(self.data)
self._parse_config()
if self.use_env_vars:
self._apply_env_overrides()
logger.info("Loaded configuration from: %s", config_path)
except (FileNotFoundError, OSError, SyntaxError) as e:
logger.error("Failed to load config from %s: %s", config_path, e)
raise
[docs]
def save(self, config_path: Path | None = None) -> None:
"""Save configuration to file.
Note: For TOML output, requires the optional 'tomli-w' package.
Install with: pip install tomli-w
"""
if not config_path:
config_path = self.config_path
if not config_path:
raise ValueError("No config path specified")
config_path = Path(config_path)
ext = config_path.suffix.lower()
save_data = self._prepare_save_data()
try:
if ext == ".toml":
if not HAS_TOMLI_W:
raise ImportError(
"Writing TOML files requires 'tomli-w'. "
"Install with: pip install tomli-w",
)
with Path(config_path).open("wb") as f:
tomli_w.dump(save_data, f)
elif ext in {".yaml", ".yml"}:
with Path(config_path).open("w", encoding="utf-8") as f:
yaml.safe_dump(save_data, f, default_flow_style=False)
elif ext == ".json":
with Path(config_path).open("w", encoding="utf-8") as f:
json.dump(save_data, f, indent=2)
else:
raise ValueError(f"Unsupported config format: {ext}")
logger.info("Saved configuration to: %s", config_path)
except (AttributeError, FileNotFoundError, KeyError) as e:
logger.error("Failed to save config to %s: %s", config_path, e)
raise
def _parse_config(self) -> None:
"""Parse loaded configuration data."""
chunker_config = self.data.get("chunker", {})
plugin_dirs = chunker_config.get("plugin_dirs", [])
self.plugin_dirs = [self._resolve_path(p) for p in plugin_dirs]
enabled = chunker_config.get("enabled_languages")
if enabled:
self.enabled_languages = set(enabled)
default_config = chunker_config.get("default_plugin_config", {})
self.default_plugin_config = self._parse_plugin_config(default_config)
languages = self.data.get("languages", {})
for lang, config in languages.items():
self.plugin_configs[lang] = self._parse_plugin_config(config)
@classmethod
def _parse_plugin_config(cls, config_dict: dict[str, Any]) -> PluginConfig:
"""Parse a plugin configuration dictionary."""
enabled = config_dict.get("enabled", True)
chunk_types = config_dict.get("chunk_types")
if chunk_types:
chunk_types = set(chunk_types)
min_chunk_size = config_dict.get("min_chunk_size", 1)
max_chunk_size = config_dict.get("max_chunk_size")
known_fields = {"enabled", "chunk_types", "min_chunk_size", "max_chunk_size"}
custom_options = {
key: value for key, value in config_dict.items() if key not in known_fields
}
return PluginConfig(
enabled=enabled,
chunk_types=chunk_types,
min_chunk_size=min_chunk_size,
max_chunk_size=max_chunk_size,
custom_options=custom_options,
)
def _prepare_save_data(self) -> dict[str, Any]:
"""Prepare configuration data for saving."""
data = {}
chunker = {}
if self.plugin_dirs:
chunker["plugin_dirs"] = [str(p) for p in self.plugin_dirs]
if self.enabled_languages:
chunker["enabled_languages"] = sorted(self.enabled_languages)
if self.default_plugin_config != PluginConfig():
chunker["default_plugin_config"] = self._plugin_config_to_dict(
self.default_plugin_config,
)
if chunker:
data["chunker"] = chunker
if self.plugin_configs:
languages = {}
for lang, config in sorted(self.plugin_configs.items()):
languages[lang] = self._plugin_config_to_dict(config)
data["languages"] = languages
return data
@staticmethod
def _plugin_config_to_dict(config: PluginConfig) -> dict[str, Any]:
"""Convert PluginConfig to dictionary."""
result = {}
if not config.enabled:
result["enabled"] = False
if config.chunk_types:
result["chunk_types"] = sorted(config.chunk_types)
if config.min_chunk_size != 1:
result["min_chunk_size"] = config.min_chunk_size
if config.max_chunk_size:
result["max_chunk_size"] = config.max_chunk_size
result.update(config.custom_options)
return result
def _resolve_path(self, path_str: str) -> Path:
"""Resolve a path string relative to config file location."""
path = Path(path_str)
if path_str.startswith("~"):
return path.expanduser()
if path.is_absolute():
return path
if self.config_path:
return (self.config_path.parent / path).resolve()
return path.resolve()
[docs]
def get_plugin_config(self, language: str) -> PluginConfig:
"""Get configuration for a specific language plugin."""
if self.enabled_languages and language not in self.enabled_languages:
return PluginConfig(enabled=False)
return self.plugin_configs.get(language, self.default_plugin_config)
[docs]
def set_plugin_config(self, language: str, config: PluginConfig) -> None:
"""Set configuration for a specific language plugin."""
self.plugin_configs[language] = config
[docs]
def add_plugin_directory(self, directory: Path) -> None:
"""Add a plugin directory."""
directory = Path(directory).resolve()
if directory not in self.plugin_dirs:
self.plugin_dirs.append(directory)
[docs]
def remove_plugin_directory(self, directory: Path) -> None:
"""Remove a plugin directory."""
directory = Path(directory).resolve()
if directory in self.plugin_dirs:
self.plugin_dirs.remove(directory)
[docs]
@classmethod
def create_example_config(cls, config_path: Path) -> None:
"""Create an example configuration file."""
example_data = {
"chunker": {
"plugin_dirs": ["./plugins", "~/.chunker/plugins"],
"enabled_languages": ["python", "rust", "javascript", "c", "cpp"],
"default_plugin_config": {"min_chunk_size": 3, "max_chunk_size": 500},
},
"languages": {
"python": {
"enabled": True,
"chunk_types": [
"function_definition",
"class_definition",
"async_function_definition",
],
"include_docstrings": True,
},
"rust": {
"enabled": True,
"chunk_types": [
"function_item",
"impl_item",
"struct_item",
"enum_item",
"trait_item",
],
},
"javascript": {
"enabled": True,
"chunk_types": [
"function_declaration",
"method_definition",
"class_declaration",
"arrow_function",
],
"include_jsx": True,
},
},
}
config = cls()
config.data = example_data
config.save(config_path)
def _expand_env_vars(self, data: Any) -> Any:
"""Recursively expand environment variables in configuration data.
Supports ${VAR} and ${VAR:default} syntax.
"""
if isinstance(data, str):
def replacer(match):
var_expr = match.group(1)
if ":" in var_expr:
var_name, default = var_expr.split(":", 1)
else:
var_name, default = var_expr, None
value = os.environ.get(var_name)
if value is None:
if default is not None:
return default
logger.warning("Environment variable '%s' not found", var_name)
return match.group(0) # Keep original
return value
return self.ENV_VAR_PATTERN.sub(replacer, data)
if isinstance(data, dict):
return {key: self._expand_env_vars(value) for key, value in data.items()}
if isinstance(data, list):
return [self._expand_env_vars(item) for item in data]
return data
def _apply_env_overrides(self) -> None:
"""Apply environment variable overrides to configuration.
Environment variables with CHUNKER_ prefix override config values.
Examples:
- CHUNKER_ENABLED_LANGUAGES=python,rust
- CHUNKER_PLUGIN_DIRS=/path/one,/path/two
- CHUNKER_LANGUAGES_PYTHON_ENABLED=false
"""
for env_var, value in os.environ.items():
if not env_var.startswith(self.ENV_PREFIX):
continue
config_path = env_var[len(self.ENV_PREFIX) :].lower()
path_parts = config_path.split("_")
if config_path == "enabled_languages":
self.enabled_languages = set(value.split(","))
logger.info(
"Set enabled_languages from env: %s",
self.enabled_languages,
)
continue
if config_path == "plugin_dirs":
self.plugin_dirs = [Path(p.strip()) for p in value.split(",")]
logger.info("Set plugin_dirs from env: %s", self.plugin_dirs)
continue
if len(path_parts) >= 2 and path_parts[0] == "languages":
if len(path_parts) >= 3:
lang = path_parts[1]
setting = "_".join(path_parts[2:])
if lang not in self.plugin_configs:
self.plugin_configs[lang] = PluginConfig()
if setting == "enabled":
self.plugin_configs[lang].enabled = value.lower() == "true"
elif setting == "min_chunk_size":
self.plugin_configs[lang].min_chunk_size = int(value)
elif setting == "max_chunk_size":
self.plugin_configs[lang].max_chunk_size = int(value)
elif setting == "chunk_types":
self.plugin_configs[lang].chunk_types = set(value.split(","))
else:
self.plugin_configs[lang].custom_options[setting] = value
logger.info("Set %s.%s from env: %s", lang, setting, value)
elif (
len(path_parts) >= 2
and path_parts[0] == "default"
and path_parts[1] == "plugin"
and path_parts[2] == "config"
):
setting = "_".join(path_parts[3:])
if setting == "min_chunk_size":
self.default_plugin_config.min_chunk_size = int(value)
elif setting == "max_chunk_size":
self.default_plugin_config.max_chunk_size = int(value)
logger.info("Set default_plugin_config.%s from env: %s", setting, value)
[docs]
@classmethod
def get_env_var_info(cls) -> dict[str, str]:
"""Get information about supported environment variables."""
return {
f"{cls.ENV_PREFIX}ENABLED_LANGUAGES": "Comma-separated list of enabled languages",
f"{cls.ENV_PREFIX}PLUGIN_DIRS": "Comma-separated list of plugin directories",
f"{cls.ENV_PREFIX}LANGUAGES_<LANG>_ENABLED": "Enable/disable specific language (true/false)",
f"{cls.ENV_PREFIX}LANGUAGES_<LANG>_MIN_CHUNK_SIZE": "Minimum chunk size for language",
f"{cls.ENV_PREFIX}LANGUAGES_<LANG>_MAX_CHUNK_SIZE": "Maximum chunk size for language",
f"{cls.ENV_PREFIX}LANGUAGES_<LANG>_CHUNK_TYPES": "Comma-separated list of chunk types",
f"{cls.ENV_PREFIX}DEFAULT_PLUGIN_CONFIG_MIN_CHUNK_SIZE": "Default minimum chunk size",
f"{cls.ENV_PREFIX}DEFAULT_PLUGIN_CONFIG_MAX_CHUNK_SIZE": "Default maximum chunk size",
}