# Source code for chunker.chunker

"""Main chunker module with token-aware chunking capabilities."""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

# Import the core functions from the new module
from .core import chunk_text
from .token.chunker import TreeSitterTokenAwareChunker
from .token.counter import TiktokenCounter

if TYPE_CHECKING:
    from .types import CodeChunk

__all__ = [
    "TreeSitterTokenAwareChunker",
    "chunk_file_with_token_limit",
    "chunk_text_with_token_limit",
    "count_chunk_tokens",
]


def chunk_text_with_token_limit(
    text: str,
    language: str,
    max_tokens: int,
    file_path: str = "",
    model: str = "gpt-4",
    extract_metadata: bool = True,
    include_retrieval_metadata: bool = False,
) -> list[CodeChunk]:
    """Parse text and return chunks that respect token limits.

    This function chunks code using tree-sitter and ensures no chunk
    exceeds the specified token limit. Large chunks are automatically
    split while preserving code structure when possible.

    Args:
        text: Source code text to chunk
        language: Programming language
        max_tokens: Maximum tokens per chunk
        file_path: Path to the file (optional)
        model: Tokenizer model to use (default: "gpt-4")
        extract_metadata: Whether to extract metadata (default: True)
        include_retrieval_metadata: Whether to add retrieval-oriented metadata

    Returns:
        List of CodeChunk objects with token counts in metadata
    """
    # First get regular chunks via the structural (tree-sitter) chunker.
    chunks = chunk_text(
        text,
        language,
        file_path,
        extract_metadata,
        include_retrieval_metadata,
    )

    # Create the token-aware chunker used for counting and splitting.
    token_chunker = TreeSitterTokenAwareChunker()

    # Annotate each chunk with its token count under the given model.
    chunks_with_tokens = token_chunker.add_token_info(chunks, model)

    # Pass through chunks that fit; split any that exceed the limit.
    final_chunks = []
    for chunk in chunks_with_tokens:
        token_count = chunk.metadata.get("token_count", 0)
        if token_count <= max_tokens:
            final_chunks.append(chunk)
        else:
            # NOTE(review): relies on the chunker's private splitter
            # (_split_large_chunk) — consider exposing a public method.
            split_chunks = token_chunker._split_large_chunk(chunk, max_tokens, model)
            final_chunks.extend(split_chunks)

    return final_chunks
def chunk_file_with_token_limit(
    path: str | Path,
    language: str,
    max_tokens: int,
    model: str = "gpt-4",
    extract_metadata: bool = True,
    include_retrieval_metadata: bool = False,
) -> list[CodeChunk]:
    """Parse file and return chunks that respect token limits.

    Reads the file as UTF-8 text and delegates to
    ``chunk_text_with_token_limit``, so no chunk exceeds the specified
    token limit; oversized chunks are split while preserving code
    structure when possible.

    Args:
        path: Path to the file to chunk
        language: Programming language
        max_tokens: Maximum tokens per chunk
        model: Tokenizer model to use (default: "gpt-4")
        extract_metadata: Whether to extract metadata (default: True)
        include_retrieval_metadata: Whether to add retrieval-oriented metadata

    Returns:
        List of CodeChunk objects with token counts in metadata
    """
    source_text = Path(path).read_text(encoding="utf-8")
    return chunk_text_with_token_limit(
        source_text,
        language,
        max_tokens,
        str(path),
        model,
        extract_metadata,
        include_retrieval_metadata,
    )


def count_chunk_tokens(chunk: CodeChunk, model: str = "gpt-4") -> int:
    """Count tokens in a code chunk.

    Args:
        chunk: The CodeChunk to count tokens for
        model: Tokenizer model to use (default: "gpt-4")

    Returns:
        Number of tokens in the chunk's content
    """
    return TiktokenCounter().count_tokens(chunk.content, model)