diff --git a/CLI.md b/CLI.md new file mode 100644 index 00000000..c3128f0f --- /dev/null +++ b/CLI.md @@ -0,0 +1,211 @@ +# tiktoken CLI + +Command-line interface for counting tokens in files and directories. + +## Installation + +After installing tiktoken, the `tiktoken` command will be available: + +```bash +pip install tiktoken +``` + +## Usage + +### Basic Token Counting + +Count tokens in a single file: +```bash +tiktoken count file.txt +``` + +Output: +``` +42 +``` + +### Using Specific Models + +Count tokens using a specific model's encoding: +```bash +tiktoken count --model gpt-4o document.txt +tiktoken count --model gpt-4-turbo code.py +``` + +### Directory Operations + +Count tokens in all files in a directory: +```bash +tiktoken count --recursive ./src/ +``` + +Use glob patterns to filter files: +```bash +tiktoken count --glob "*.py" ./project/ +tiktoken count --recursive --glob "*.md" ./docs/ +``` + +### Output Formats + +#### JSON Output +```bash +tiktoken count --json file.txt +``` + +Output: +```json +{ + "summary": { + "total_files": 1, + "total_tokens": 1250, + "total_characters": 5432, + "average_tokens_per_file": 1250 + }, + "files": [ + { + "file": "file.txt", + "tokens": 1250, + "chars": 5432, + "lines": 85 + } + ] +} +``` + +#### CSV Output +```bash +tiktoken count --csv ./src/ +``` + +Output: +```csv +file,tokens,characters,lines +src/main.py,450,2100,65 +src/utils.py,320,1540,48 +src/config.py,180,850,28 +``` + +#### Per-File Breakdown +```bash +tiktoken count --per-file ./src/ +``` + +Output: +``` +src/main.py: 450 tokens +src/utils.py: 320 tokens +src/config.py: 180 tokens + +Total files: 3 +Total tokens: 950 +Total characters: 4490 +Average tokens per file: 316 +``` + +## Use Cases + +### Estimating Context Window Usage + +Check if your codebase fits in a model's context window: + +```bash +# GPT-4 Turbo has 128k token context +tiktoken count --model gpt-4-turbo --recursive ./my-project/ + +# Output: Total tokens: 45,230 +# Result: Fits 
comfortably in context window +``` + +### Cost Estimation + +Estimate API costs by counting tokens: + +```bash +tiktoken count --json --recursive ./documents/ > token_report.json +# Use the token count to calculate costs based on model pricing +``` + +### CI/CD Integration + +Add token counting to your CI pipeline (`--summary` guarantees the "Total tokens" line even when only one file is found): + +```bash +#!/bin/bash +TOKEN_COUNT=$(tiktoken count --summary --recursive ./src/ | grep "Total tokens" | awk '{print $3}' | tr -d ',') +MAX_TOKENS=50000 + +if [ "$TOKEN_COUNT" -gt "$MAX_TOKENS" ]; then + echo "Error: Codebase exceeds $MAX_TOKENS tokens (found: $TOKEN_COUNT)" + exit 1 +fi +``` + +### Documentation Analysis + +Analyze documentation token usage: + +```bash +tiktoken count --recursive --glob "*.md" --per-file ./docs/ | tee docs_tokens.txt +``` + +## Command Reference + +### Arguments + +- `paths`: One or more files or directories to process + +### Options + +- `-m, --model MODEL`: Use the encoding for a specific OpenAI model (e.g., `gpt-4o`, `gpt-4-turbo`); overrides `--encoding` +- `-e, --encoding ENCODING`: Specify the encoding directly (default: `o200k_base`) +- `-r, --recursive`: Process directories recursively +- `-g, --glob PATTERN`: Filter files inside directories using a glob pattern (e.g., `"*.py"`) +- `--json`: Output results as JSON (takes precedence over `--csv`) +- `--csv`: Output results as CSV +- `--summary`: Show summary statistics in text output (always shown when more than one file is processed) +- `--per-file`: Show per-file token counts in text output (when more than one file is processed) + +## Examples + +### Count tokens in Python files +```bash +tiktoken count --glob "*.py" --recursive ./project/ +``` + +### Generate JSON report for multiple files +```bash +tiktoken count --json file1.txt file2.txt file3.txt > report.json +``` + +### Check specific model compatibility +```bash +tiktoken count --model gpt-4o --summary ./codebase/ +``` + +### Export to CSV for analysis +```bash +tiktoken count --csv --recursive ./src/ > tokens.csv +``` + +## Tips + +1. **Performance**: The CLI processes files quickly thanks to tiktoken's fast Rust implementation +2. **Binary Files**: Files that cannot be decoded as UTF-8 (such as binary files) are skipped silently +3. 
**Large Directories**: Use `--glob` to filter files and speed up processing +4. **Shell Integration**: Pipe output to other tools for further processing + +## Troubleshooting + +**Error: "No files found to process"** +- Check your glob pattern syntax +- Ensure files exist in the specified path +- Use `--recursive` for subdirectories + +**Warning: "Unknown model 'xyz', using o200k_base encoding"** +- The model name might be incorrect; the count falls back to the `o200k_base` encoding +- Use `--encoding` instead to specify encoding directly +- Check [OpenAI's model documentation](https://platform.openai.com/docs/models) for valid model names + +**Files missing from the results** +- Files that cannot be decoded as UTF-8 (such as binary files) are skipped without a message +- This is expected behavior; if every file is skipped, the CLI reports "No files could be processed" diff --git a/setup.py b/setup.py index 2a3ebbf1..5c1df32d 100644 --- a/setup.py +++ b/setup.py @@ -15,5 +15,10 @@ ], package_data={"tiktoken": ["py.typed"]}, packages=["tiktoken", "tiktoken_ext"], + entry_points={ + "console_scripts": [ + "tiktoken=tiktoken.cli:main", + ], + }, zip_safe=False, ) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 00000000..9a4f8aff --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,119 @@ +""" +Test suite for tiktoken CLI. + +Run with: pytest tests/test_cli.py +""" + +import os +import sys +import tempfile +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from tiktoken.cli import ( + count_tokens_in_text, + count_tokens_in_file, + collect_files, + format_output_json, + format_output_csv, +) + + +def test_count_tokens_in_text(): + """Test basic token counting.""" + text = "Hello, world!" 
def test_count_tokens_in_text():
    """Counting a short string yields a positive integer token count."""
    count = count_tokens_in_text("Hello, world!", "o200k_base")
    assert isinstance(count, int)
    assert count > 0


def test_count_tokens_in_file():
    """A readable UTF-8 file yields token, character and line statistics."""
    text = "This is a test file for tiktoken CLI."
    # TemporaryDirectory removes the file for us, so no delete=False handle
    # and no manual unlink are needed.
    with tempfile.TemporaryDirectory() as tmpdir:
        sample = Path(tmpdir) / "sample.txt"
        sample.write_text(text, encoding="utf-8")

        result = count_tokens_in_file(sample, "o200k_base")

    assert result is not None
    assert result['file'] == str(sample)
    assert result['tokens'] > 0
    assert result['chars'] == len(text)
    # The sample has no newline, so it is exactly one line.
    assert result['lines'] == 1


def test_collect_files_single_file():
    """An explicit file path is returned as the only collected file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        target = Path(tmpdir) / "only.txt"
        target.write_text("content", encoding="utf-8")

        files = collect_files([str(target)], False, None)

        assert files == [target]


def test_collect_files_directory():
    """A directory argument yields the files directly inside it."""
    with tempfile.TemporaryDirectory() as tmpdir:
        test_dir = Path(tmpdir)
        (test_dir / "file1.txt").write_text("content 1")
        (test_dir / "file2.txt").write_text("content 2")

        files = collect_files([tmpdir], False, None)

        # Compare names in sorted order so the test does not depend on the
        # order in which the filesystem lists the directory.
        assert sorted(p.name for p in files) == ["file1.txt", "file2.txt"]


def test_format_output_json():
    """JSON output parses and carries the expected summary and file rows."""
    import json  # local import: only this test needs it

    results = [
        {'file': 'test.txt', 'tokens': 100, 'chars': 500, 'lines': 10}
    ]

    # Parse the output instead of searching for substrings, so a value
    # landing in the wrong field makes the test fail.
    report = json.loads(format_output_json(results))

    assert report['summary'] == {
        'total_files': 1,
        'total_tokens': 100,
        'total_characters': 500,
        'average_tokens_per_file': 100,
    }
    assert report['files'] == results


def test_format_output_csv():
    """CSV output is a header row followed by one row per file."""
    results = [
        {'file': 'test.txt', 'tokens': 100, 'chars': 500, 'lines': 10}
    ]

    output = format_output_csv(results)

    assert output.splitlines() == [
        'file,tokens,characters,lines',
        'test.txt,100,500,10',
    ]


if __name__ == '__main__':
    # Run basic tests without pytest, reporting each one as it passes.
    print("Running tiktoken CLI tests...")

    for test in (
        test_count_tokens_in_text,
        test_count_tokens_in_file,
        test_collect_files_single_file,
        test_collect_files_directory,
        test_format_output_json,
        test_format_output_csv,
    ):
        test()
        print(f"✓ {test.__name__}")

    print("\n✅ All tests passed!")
def count_tokens_in_text(text: str, encoding_name: str) -> int:
    """Return the number of tokens ``text`` encodes to.

    Args:
        text: The text to tokenize.
        encoding_name: Name of the tiktoken encoding to use, e.g.
            ``"o200k_base"``.

    Returns:
        The number of tokens produced for ``text``.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # Arbitrary files may contain text that looks like a special token
    # (e.g. "<|endoftext|>"). encode() rejects such text by default;
    # encode_ordinary() tokenizes it as plain text, which is what a counter
    # for arbitrary file contents needs.
    return len(enc.encode_ordinary(text))


def count_tokens_in_file(file_path: Path, encoding_name: str) -> Optional[Dict[str, Any]]:
    """Count tokens, characters and lines in a single UTF-8 text file.

    Args:
        file_path: Path of the file to read.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        A dict with the keys ``'file'`` (the path as a string), ``'tokens'``,
        ``'chars'`` and ``'lines'``, or ``None`` when the file is skipped:
        silently if it is not valid UTF-8 (treated as binary), or after an
        error message on stderr for any other failure.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        token_count = count_tokens_in_text(content, encoding_name)

        # Every '\n' ends one line; a final line without a trailing newline
        # still counts. This gives 0 for an empty file and 1 for "text\n",
        # where `count('\n') + 1` would report 1 and 2.
        line_count = content.count('\n')
        if content and not content.endswith('\n'):
            line_count += 1

        return {
            'file': str(file_path),
            'tokens': token_count,
            'chars': len(content),
            'lines': line_count
        }
    except UnicodeDecodeError:
        return None  # Skip binary files
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a batch run.
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
        return None
def get_encoding_for_model(model: str) -> str:
    """Return the name of the encoding tiktoken associates with ``model``.

    An unrecognized model is not fatal: when ``tiktoken.encoding_for_model``
    raises ``KeyError``, a warning is written to stderr and ``"o200k_base"``
    is returned instead.

    Args:
        model: Model name, e.g. ``"gpt-4o"``.

    Returns:
        The encoding name to pass to ``tiktoken.get_encoding``.
    """
    try:
        return tiktoken.encoding_for_model(model).name
    except KeyError:
        print(f"Warning: Unknown model '{model}', using o200k_base encoding", file=sys.stderr)
        return "o200k_base"


def collect_files(paths: List[str], recursive: bool, pattern: Optional[str]) -> List[Path]:
    """Expand command-line path arguments into the list of files to process.

    Each argument is handled in order:

    * an existing file is taken as-is (``pattern`` is not applied to it);
    * a directory contributes the entries matching ``pattern`` (every entry
      when ``pattern`` is empty or ``None``), from the directory itself or,
      with ``recursive``, from its whole subtree;
    * anything else is treated as a glob pattern, relative to the current
      directory or, for an absolute pattern, to its filesystem root.

    Only regular files are returned, and each path is returned once even if
    several arguments cover it. Entries from one directory or glob argument
    are sorted so the output order does not depend on the filesystem.

    Args:
        paths: File paths, directory paths or glob patterns.
        recursive: Whether directories are searched recursively.
        pattern: Optional glob used to filter directory contents.

    Returns:
        The files to process, in argument order.
    """
    collected: List[Path] = []
    seen = set()

    for path_str in paths:
        path = Path(path_str)

        if path.is_file():
            candidates = [path]
        elif path.is_dir():
            finder = path.rglob if recursive else path.glob
            candidates = sorted(finder(pattern or '*'))
        else:
            # Path.glob() rejects anchored (absolute) patterns with
            # NotImplementedError, which used to crash the CLI on a missing
            # absolute path. Glob from the anchor instead.
            if path.anchor:
                base = Path(path.anchor)
                relative = path.relative_to(path.anchor).as_posix()
            else:
                base = Path('.')
                relative = path_str
            candidates = sorted(base.glob(relative))

        for candidate in candidates:
            # Globs also match directories (e.g. a folder named "pkg.py");
            # they cannot be opened as files, so drop them here.
            if candidate in seen or not candidate.is_file():
                continue
            seen.add(candidate)
            collected.append(candidate)

    return collected


def format_output_text(results: List[Dict[str, Any]], summary: bool, per_file: bool) -> str:
    """Render results as plain text.

    Args:
        results: Per-file dicts with at least ``'file'``, ``'tokens'`` and
            ``'chars'`` keys.
        summary: Force the summary block even for a single result.
        per_file: Print one ``"<file>: <n> tokens"`` line per result; only
            takes effect when there is more than one result.

    Returns:
        The formatted text. A single result without ``summary`` is rendered
        as the bare token count; no results and no ``summary`` yields an
        empty string.
    """
    lines: List[str] = []
    multiple = len(results) > 1

    if per_file and multiple:
        lines.extend(f"{entry['file']}: {entry['tokens']:,} tokens" for entry in results)
        lines.append("")  # Blank line before summary

    if summary or multiple:
        total_files = len(results)
        total_tokens = sum(entry['tokens'] for entry in results)
        total_chars = sum(entry['chars'] for entry in results)

        lines.append(f"Total files: {total_files}")
        lines.append(f"Total tokens: {total_tokens:,}")
        lines.append(f"Total characters: {total_chars:,}")

        # No average for an empty result set (avoids division by zero).
        if total_files > 0:
            lines.append(f"Average tokens per file: {total_tokens // total_files:,}")
    elif len(results) == 1:
        # Single file - just show token count
        lines.append(f"{results[0]['tokens']:,}")

    return '\n'.join(lines)
def format_output_json(results: List[Dict[str, Any]]) -> str:
    """Render results as a JSON document.

    Args:
        results: Per-file dicts with at least ``'tokens'`` and ``'chars'``
            keys; they are embedded unchanged under ``"files"``.

    Returns:
        A JSON string (2-space indent) with a ``"summary"`` object (file
        count, token and character totals, integer average tokens per file,
        0 when there are no results) and the ``"files"`` list.
    """
    file_count = len(results)
    total_tokens = sum(entry['tokens'] for entry in results)
    total_chars = sum(entry['chars'] for entry in results)

    report = {
        'summary': {
            'total_files': file_count,
            'total_tokens': total_tokens,
            'total_characters': total_chars,
            'average_tokens_per_file': total_tokens // file_count if file_count else 0
        },
        'files': results
    }

    return json.dumps(report, indent=2)


def format_output_csv(results: List[Dict[str, Any]]) -> str:
    """Render results as CSV with the header ``file,tokens,characters,lines``.

    Rows are written with the ``csv`` module so that file names containing
    commas, quotes or newlines are quoted rather than corrupting the columns.

    Args:
        results: Per-file dicts with ``'file'``, ``'tokens'``, ``'chars'``
            and ``'lines'`` keys.

    Returns:
        The CSV text with ``\\n`` line endings and no trailing newline.
    """
    # Local imports keep this function self-contained.
    import csv
    import io

    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')
    writer.writerow(["file", "tokens", "characters", "lines"])
    writer.writerows(
        (entry['file'], entry['tokens'], entry['chars'], entry['lines'])
        for entry in results
    )
    # The writer terminates every row; drop the final terminator because the
    # caller print()s the result.
    return buffer.getvalue().rstrip('\n')


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``tiktoken`` command."""
    parser = argparse.ArgumentParser(
        prog='tiktoken',
        description='Count tokens in files using tiktoken',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s count file.txt                    # Count tokens in a file
  %(prog)s count --model gpt-4o file.txt     # Use specific model encoding
  %(prog)s count -r ./src/                   # Count tokens in all files recursively
  %(prog)s count --glob "*.py" ./project/    # Count tokens in Python files
  %(prog)s count --json file.txt             # Output as JSON
  %(prog)s count --per-file ./src/           # Show per-file breakdown
"""
    )

    parser.add_argument(
        'command',
        choices=['count'],
        help='Command to execute (currently only "count" is supported)'
    )

    parser.add_argument(
        'paths',
        nargs='+',
        help='Files or directories to count tokens in'
    )

    parser.add_argument(
        '-m', '--model',
        default=None,
        help='OpenAI model to use for encoding (e.g., gpt-4o, gpt-4-turbo)'
    )

    parser.add_argument(
        '-e', '--encoding',
        default='o200k_base',
        help='Encoding to use (default: o200k_base)'
    )

    parser.add_argument(
        '-r', '--recursive',
        action='store_true',
        help='Process directories recursively'
    )

    parser.add_argument(
        '-g', '--glob',
        default=None,
        help='Glob pattern to filter files (e.g., "*.py")'
    )

    parser.add_argument(
        '--json',
        action='store_true',
        help='Output results as JSON'
    )

    parser.add_argument(
        '--csv',
        action='store_true',
        help='Output results as CSV'
    )

    parser.add_argument(
        '--summary',
        action='store_true',
        help='Show summary statistics'
    )

    parser.add_argument(
        '--per-file',
        action='store_true',
        help='Show per-file token counts'
    )

    return parser


def main(argv: Optional[List[str]] = None) -> int:
    """Run the ``tiktoken`` command-line interface.

    Args:
        argv: Arguments to parse, without the program name. ``None`` (the
            default, used by the console-script entry point) makes argparse
            read ``sys.argv``; passing a list allows programmatic use.

    Returns:
        ``0`` on success, ``1`` when no files were found or none could be
        processed. Invalid arguments make argparse raise ``SystemExit``.
    """
    args = _build_parser().parse_args(argv)

    # A model name, when given, takes precedence over --encoding.
    if args.model:
        encoding_name = get_encoding_for_model(args.model)
    else:
        encoding_name = args.encoding

    # Collect files to process
    files = collect_files(args.paths, args.recursive, args.glob)

    if not files:
        print("No files found to process", file=sys.stderr)
        return 1

    # Files that could not be read or decoded come back as None; skip them.
    results = []
    for file_path in files:
        result = count_tokens_in_file(file_path, encoding_name)
        if result is not None:
            results.append(result)

    if not results:
        print("No files could be processed", file=sys.stderr)
        return 1

    # --json wins over --csv when both are given; text is the default.
    if args.json:
        print(format_output_json(results))
    elif args.csv:
        print(format_output_csv(results))
    else:
        print(format_output_text(results, args.summary, args.per_file))

    return 0


if __name__ == '__main__':
    sys.exit(main())