Source code for chunker.chunker
"""Main chunker module with token-aware chunking capabilities."""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
# Import the core functions from the new module
from .core import chunk_text
from .token.chunker import TreeSitterTokenAwareChunker
from .token.counter import TiktokenCounter
if TYPE_CHECKING:
from .types import CodeChunk
__all__ = [
    "TreeSitterTokenAwareChunker",
    "chunk_file_with_token_limit",
    "chunk_text_with_token_limit",
    "count_chunk_tokens",
]


def chunk_text_with_token_limit(
text: str,
language: str,
max_tokens: int,
file_path: str = "",
model: str = "gpt-4",
extract_metadata: bool = True,
include_retrieval_metadata: bool = False,
) -> list[CodeChunk]:
"""Parse text and return chunks that respect token limits.
This function chunks code using tree-sitter and ensures no chunk exceeds
the specified token limit. Large chunks are automatically split while
preserving code structure when possible.
Args:
text: Source code text to chunk
language: Programming language
max_tokens: Maximum tokens per chunk
file_path: Path to the file (optional)
model: Tokenizer model to use (default: "gpt-4")
extract_metadata: Whether to extract metadata (default: True)
include_retrieval_metadata: Whether to add retrieval-oriented metadata
Returns:
List of CodeChunk objects with token counts in metadata
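
    Example:
        A minimal, illustrative call; actual token counts depend on the
        tiktoken vocabulary for ``model``::

            chunks = chunk_text_with_token_limit(
                "def add(a, b):\n    return a + b\n",
                language="python",
                max_tokens=128,
            )
            assert all(c.metadata["token_count"] <= 128 for c in chunks)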
"""
# First get regular chunks
chunks = chunk_text(
text,
language,
file_path,
extract_metadata,
include_retrieval_metadata,
)
# Create token-aware chunker
token_chunker = TreeSitterTokenAwareChunker()
# Add token info and split if needed
chunks_with_tokens = token_chunker.add_token_info(chunks, model)
# Handle oversized chunks
final_chunks = []
for chunk in chunks_with_tokens:
token_count = chunk.metadata.get("token_count", 0)
if token_count <= max_tokens:
final_chunks.append(chunk)
else:
# Split the oversized chunk
split_chunks = token_chunker._split_large_chunk(chunk, max_tokens, model)
final_chunks.extend(split_chunks)
    return final_chunks


def chunk_file_with_token_limit(
path: str | Path,
language: str,
max_tokens: int,
model: str = "gpt-4",
extract_metadata: bool = True,
include_retrieval_metadata: bool = False,
) -> list[CodeChunk]:
"""Parse file and return chunks that respect token limits.
This function chunks a file using tree-sitter and ensures no chunk exceeds
the specified token limit. Large chunks are automatically split while
preserving code structure when possible.
Args:
path: Path to the file to chunk
language: Programming language
max_tokens: Maximum tokens per chunk
model: Tokenizer model to use (default: "gpt-4")
extract_metadata: Whether to extract metadata (default: True)
include_retrieval_metadata: Whether to add retrieval-oriented metadata
Returns:
List of CodeChunk objects with token counts in metadata
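
    Example:
        Illustrative only; assumes a UTF-8 source file named
        ``example.py`` exists in the working directory::

            chunks = chunk_file_with_token_limit(
                "example.py", language="python", max_tokens=256
            )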
"""
src = Path(path).read_text(encoding="utf-8")
return chunk_text_with_token_limit(
src,
language,
max_tokens,
str(path),
model,
extract_metadata,
include_retrieval_metadata,
    )


def count_chunk_tokens(chunk: CodeChunk, model: str = "gpt-4") -> int:
"""Count tokens in a code chunk.
Args:
chunk: The CodeChunk to count tokens for
model: Tokenizer model to use (default: "gpt-4")
Returns:
Number of tokens in the chunk
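
    Example:
        Illustrative; assumes ``chunks`` was produced by one of the
        chunking helpers above::

            n_tokens = count_chunk_tokens(chunks[0], model="gpt-4")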
"""
counter = TiktokenCounter()
return counter.count_tokens(chunk.content, model)
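

# --- Usage sketch (illustrative; not part of the public API) ---
# Assumes a local UTF-8 file "example.py" and that tiktoken data for
# the chosen model is available. The token limit here is arbitrary.
if __name__ == "__main__":
    demo_chunks = chunk_file_with_token_limit(
        "example.py",
        language="python",
        max_tokens=200,
    )
    for demo_chunk in demo_chunks:
        # Every chunk should respect the limit, because oversized chunks
        # were split by the token-aware chunker above.
        print(demo_chunk.metadata.get("token_count"), repr(demo_chunk.content[:40]))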