# Source code for chunker.types
"""Common types used across the chunker modules."""
from __future__ import annotations
import hashlib
from dataclasses import dataclass, field
from typing import Any
__all__ = [
"CodeChunk",
"compute_definition_id",
"compute_file_id",
"compute_node_id",
"compute_symbol_id",
"compute_text_hash16",
]
def compute_text_hash16(text: str) -> str:
    """Return the first 16 hex characters of the SHA-1 digest of *text*.

    The text is encoded as UTF-8 before hashing.
    """
    hasher = hashlib.sha1()
    hasher.update(text.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:16]
def compute_file_id(file_path: str) -> str:
    """Return the full SHA-1 hex digest identifying *file_path*.

    The hashed payload is the path prefixed with ``file:``, so the ID
    depends only on the path string itself.
    """
    return hashlib.sha1(f"file:{file_path}".encode()).hexdigest()
def compute_node_id(
    file_path: str,
    language: str,
    parent_route: list[str],
    content: str,
) -> str:
    """Return a content-sensitive SHA-1 hex ID for a node.

    The digest covers ``file_path``, ``language``, the ``parent_route``
    segments joined with ``/``, and the 16-character content hash from
    ``compute_text_hash16``, separated by ``|``. A falsy ``parent_route``
    is treated as an empty route and a falsy ``content`` as the empty
    string.
    """
    segments = parent_route if parent_route else []
    joined_route = "/".join(segments)
    body_digest = compute_text_hash16(content if content else "")
    hasher = hashlib.sha1()
    hasher.update(f"{file_path}|{language}|{joined_route}|{body_digest}".encode())
    return hasher.hexdigest()
def compute_symbol_id(language: str, file_path: str, symbol_name: str) -> str:
    """Return the full SHA-1 hex digest identifying a symbol.

    The hashed payload is ``sym:<language>:<file_path>:<symbol_name>``.
    """
    payload = f"sym:{language}:{file_path}:{symbol_name}"
    hasher = hashlib.sha1()
    hasher.update(payload.encode())
    return hasher.hexdigest()
def compute_definition_id(
    file_path: str,
    language: str,
    qualified_route: list[str],
) -> str:
    """Return a stable, content-insensitive SHA-1 hex ID for a definition.

    No content hash is involved here (unlike node_id/chunk_id); only
    structural information is hashed:

    - file_path: the source file
    - language: the programming language
    - qualified_route: hierarchical path including names, e.g.
      ["class_definition:MyClass", "method_definition:foo"]; a falsy route
      is treated as empty

    Editing a definition's body therefore leaves the ID untouched. The ID
    does change when:

    - the definition moves to a different structural location
    - the definition is renamed
    - the file path changes

    Anonymous definitions are expected to arrive with a positional route
    segment such as "function:anon@42", where 42 is the start line number.
    """
    segments = qualified_route if qualified_route else []
    joined_route = "/".join(segments)
    hasher = hashlib.sha1()
    hasher.update(f"def:{file_path}|{language}|{joined_route}".encode())
    return hasher.hexdigest()
[docs]
@dataclass
class CodeChunk:
    """A span of source code together with its identity and hierarchy data.

    Identity fields left empty by the caller are filled in by
    ``__post_init__``:

    - ``node_id`` and ``chunk_id`` default to the value of ``generate_id()``.
    - ``file_id`` is derived from ``file_path`` when the path is non-empty.
    - ``definition_id`` is derived from ``qualified_route`` when both the
      route and ``file_path`` are non-empty.

    Two chunks are equal when both ``chunk_id`` and ``content`` match;
    hashing uses ``chunk_id`` alone. Instances are mutable, so reassigning
    ``chunk_id`` while a chunk sits in a set or is used as a dict key makes
    it unreachable there.
    """

    language: str
    file_path: str
    node_type: str
    start_line: int
    end_line: int
    byte_start: int
    byte_end: int
    parent_context: str
    content: str
    chunk_id: str = ""
    parent_chunk_id: str | None = None
    references: list[str] = field(default_factory=list)
    dependencies: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)
    # New stable identity and hierarchy fields
    node_id: str = ""
    file_id: str = ""
    symbol_id: str | None = None
    parent_route: list[str] = field(default_factory=list)
    # Content-insensitive identity for tracking definitions across code changes
    qualified_route: list[str] = field(default_factory=list)
    definition_id: str = ""

    def generate_id(self) -> str:
        """Generate a stable ID using file/language/route/text hash."""
        return compute_node_id(
            self.file_path,
            self.language,
            self.parent_route,
            self.content,
        )

    def __post_init__(self) -> None:
        """Fill in any identity field the caller left empty."""
        if not (self.node_id and self.chunk_id):
            # Hash the content once and reuse the result for both IDs
            # instead of calling generate_id() separately for each.
            generated = self.generate_id()
            if not self.node_id:
                self.node_id = generated
            if not self.chunk_id:
                # Use full 40-char SHA1 for chunk_id to match tests
                self.chunk_id = generated
        if not self.file_id and self.file_path:
            self.file_id = compute_file_id(self.file_path)
        # Compute definition_id from qualified_route if not already set
        if not self.definition_id and self.qualified_route and self.file_path:
            self.definition_id = compute_definition_id(
                self.file_path,
                self.language,
                self.qualified_route,
            )
        # Do not auto-inject span/route into metadata; tests expect control over metadata presence

    def __eq__(self, other: object) -> bool:
        """Return True when *other* is a CodeChunk with the same id and content."""
        if not isinstance(other, CodeChunk):
            return NotImplemented
        # Chunks with same id but different content should not be equal
        return self.chunk_id == other.chunk_id and self.content == other.content

    def __hash__(self) -> int:
        """Hash on ``chunk_id`` only, which keeps equal chunks in the same bucket."""
        # Hash by stable identifier to allow set/dict usage
        return hash(self.chunk_id)