Add --cxml flag (#16)

Refs #15

---------

Co-authored-by: Simon Willison <swillison@gmail.com>
This commit is contained in:
Lex Herbert
2024-09-08 22:19:59 -07:00
committed by GitHub
parent 4810ef7880
commit db4a164fec
4 changed files with 144 additions and 18 deletions

1
.gitignore vendored
View File

@@ -7,3 +7,4 @@ venv
.pytest_cache .pytest_cache
*.egg-info *.egg-info
.DS_Store .DS_Store
build/

View File

@@ -12,6 +12,7 @@ For background on this project see [Building files-to-prompt entirely using Clau
## Installation ## Installation
Install this tool using `pip`: Install this tool using `pip`:
```bash ```bash
pip install files-to-prompt pip install files-to-prompt
``` ```
@@ -29,11 +30,13 @@ This will output the contents of every file, with each file preceded by its rela
### Options ### Options
- `--include-hidden`: Include files and folders starting with `.` (hidden files and directories). - `--include-hidden`: Include files and folders starting with `.` (hidden files and directories).
```bash ```bash
files-to-prompt path/to/directory --include-hidden files-to-prompt path/to/directory --include-hidden
``` ```
- `--ignore-gitignore`: Ignore `.gitignore` files and include all files. - `--ignore-gitignore`: Ignore `.gitignore` files and include all files.
```bash ```bash
files-to-prompt path/to/directory --ignore-gitignore files-to-prompt path/to/directory --ignore-gitignore
``` ```
@@ -101,6 +104,25 @@ Contents of file3.txt
--- ---
``` ```
### XML Output
Anthropic has provided [specific guidelines](https://docs.anthropic.com/claude/docs/long-context-window-tips) for optimally structuring prompts to take advantage of Claude's extended context window.
To structure the output in this way, use the optional `--cxml` flag, which will produce output like this:
```xml
<documents>
<document path="my_directory/file1.txt">
Contents of file1.txt
</document>
<document path="my_directory/file2.txt">
Contents of file2.txt
</document>
...
</documents>
```
## Development ## Development
To contribute to this tool, first check out the code. Then create a new virtual environment: To contribute to this tool, first check out the code. Then create a new virtual environment:
@@ -118,6 +140,7 @@ pip install -e '.[test]'
``` ```
To run the tests: To run the tests:
```bash ```bash
pytest pytest
``` ```

View File

@@ -1,7 +1,8 @@
import os import os
import click
from fnmatch import fnmatch from fnmatch import fnmatch
import click
def should_ignore(path, gitignore_rules): def should_ignore(path, gitignore_rules):
for rule in gitignore_rules: for rule in gitignore_rules:
@@ -22,18 +23,39 @@ def read_gitignore(path):
return [] return []
def print_path(path, content, xml):
    """Write one file's path and content using the selected output format.

    Args:
        path: Path label printed alongside the content.
        content: Text of the file.
        xml: When truthy, delegate to ``print_as_xml``; otherwise delegate
            to ``print_default``.
    """
    emit = print_as_xml if xml else print_default
    emit(path, content)
def print_default(path, content):
    """Echo a file in the plain format: path, ``---``, content, blank, ``---``.

    Args:
        path: Path label printed on the first line.
        content: Text of the file, printed between the two separators.
    """
    separator = "---"
    for chunk in (path, separator, content):
        click.echo(chunk)
    # An empty line is emitted after the content, before the closing separator.
    click.echo()
    click.echo(separator)
def print_as_xml(path, content):
    """Echo a file wrapped in a ``<document path="...">`` element.

    The path is interpolated into the attribute verbatim and the content is
    echoed as-is; neither is XML-escaped here.

    Args:
        path: Value placed in the ``path`` attribute of the opening tag.
        content: Text of the file, printed between the opening and closing tags.
    """
    opening_tag = f'<document path="{path}">'
    for piece in (opening_tag, content, "</document>"):
        click.echo(piece)
def process_path( def process_path(
path, include_hidden, ignore_gitignore, gitignore_rules, ignore_patterns path,
include_hidden,
ignore_gitignore,
gitignore_rules,
ignore_patterns,
claude_xml,
): ):
if os.path.isfile(path): if os.path.isfile(path):
try: try:
with open(path, "r") as f: with open(path, "r") as f:
file_contents = f.read() print_path(path, f.read(), claude_xml)
click.echo(path)
click.echo("---")
click.echo(file_contents)
click.echo()
click.echo("---")
except UnicodeDecodeError: except UnicodeDecodeError:
warning_message = f"Warning: Skipping file {path} due to UnicodeDecodeError" warning_message = f"Warning: Skipping file {path} due to UnicodeDecodeError"
click.echo(click.style(warning_message, fg="red"), err=True) click.echo(click.style(warning_message, fg="red"), err=True)
@@ -63,17 +85,11 @@ def process_path(
if not any(fnmatch(f, pattern) for pattern in ignore_patterns) if not any(fnmatch(f, pattern) for pattern in ignore_patterns)
] ]
for file in files: for file in sorted(files):
file_path = os.path.join(root, file) file_path = os.path.join(root, file)
try: try:
with open(file_path, "r") as f: with open(file_path, "r") as f:
file_contents = f.read() print_path(file_path, f.read(), claude_xml)
click.echo(file_path)
click.echo("---")
click.echo(file_contents)
click.echo()
click.echo("---")
except UnicodeDecodeError: except UnicodeDecodeError:
warning_message = ( warning_message = (
f"Warning: Skipping file {file_path} due to UnicodeDecodeError" f"Warning: Skipping file {file_path} due to UnicodeDecodeError"
@@ -100,8 +116,15 @@ def process_path(
default=[], default=[],
help="List of patterns to ignore", help="List of patterns to ignore",
) )
@click.option(
"claude_xml",
"-c",
"--cxml",
is_flag=True,
help="Output in XML-ish format suitable for Claude's long context window.",
)
@click.version_option() @click.version_option()
def cli(paths, include_hidden, ignore_gitignore, ignore_patterns): def cli(paths, include_hidden, ignore_gitignore, ignore_patterns, claude_xml):
""" """
Takes one or more paths to files or directories and outputs every file, Takes one or more paths to files or directories and outputs every file,
recursively, each one preceded with its filename like this: recursively, each one preceded with its filename like this:
@@ -114,6 +137,19 @@ def cli(paths, include_hidden, ignore_gitignore, ignore_patterns):
path/to/file2.py path/to/file2.py
--- ---
... ...
If the `--cxml` flag is provided, the output will be structured as follows:
<documents>
<document path="path/to/file1.txt">
Contents of file1.txt
</document>
<document path="path/to/file2.txt">
Contents of file2.txt
</document>
...
</documents>
""" """
gitignore_rules = [] gitignore_rules = []
for path in paths: for path in paths:
@@ -121,6 +157,17 @@ def cli(paths, include_hidden, ignore_gitignore, ignore_patterns):
raise click.BadArgumentUsage(f"Path does not exist: {path}") raise click.BadArgumentUsage(f"Path does not exist: {path}")
if not ignore_gitignore: if not ignore_gitignore:
gitignore_rules.extend(read_gitignore(os.path.dirname(path))) gitignore_rules.extend(read_gitignore(os.path.dirname(path)))
if claude_xml and path == paths[0]:
click.echo("<documents>")
process_path( process_path(
path, include_hidden, ignore_gitignore, gitignore_rules, ignore_patterns path,
include_hidden,
ignore_gitignore,
gitignore_rules,
ignore_patterns,
claude_xml,
) )
if claude_xml:
click.echo("</documents>")

View File

@@ -1,5 +1,7 @@
import os import os
from click.testing import CliRunner from click.testing import CliRunner
from files_to_prompt.cli import cli from files_to_prompt.cli import cli
@@ -186,3 +188,56 @@ def test_binary_file_warning(tmpdir):
"Warning: Skipping file test_dir/binary_file.bin due to UnicodeDecodeError" "Warning: Skipping file test_dir/binary_file.bin due to UnicodeDecodeError"
in stderr in stderr
) )
def test_xml_format_dir(tmpdir):
    """``--cxml`` on a directory wraps each file in ``<document>`` under one ``<documents>`` root."""
    fixtures = {
        "test_dir/file1.txt": "Contents of file1",
        "test_dir/file2.txt": "Contents of file2",
    }
    with tmpdir.as_cwd():
        os.makedirs("test_dir")
        for filename, text in fixtures.items():
            with open(filename, "w") as handle:
                handle.write(text)

        result = CliRunner().invoke(cli, ["test_dir", "--cxml"])
        assert result.exit_code == 0

        # Surrounding whitespace is stripped on both sides before comparing.
        expected = """
<documents>
<document path="test_dir/file1.txt">
Contents of file1
</document>
<document path="test_dir/file2.txt">
Contents of file2
</document>
</documents>
"""
        actual = result.output
        assert expected.strip() == actual.strip()
def test_cxml_format_multiple_paths(tmpdir):
    """``--cxml`` with two explicit file paths still yields a single ``<documents>`` wrapper."""
    fixtures = {
        "test_dir/file1.txt": "Contents of file1",
        "test_dir/file2.txt": "Contents of file2",
    }
    with tmpdir.as_cwd():
        os.makedirs("test_dir")
        for filename, text in fixtures.items():
            with open(filename, "w") as handle:
                handle.write(text)

        cli_args = ["test_dir/file1.txt", "test_dir/file2.txt", "--cxml"]
        result = CliRunner().invoke(cli, cli_args)
        assert result.exit_code == 0

        # Surrounding whitespace is stripped on both sides before comparing.
        expected = """
<documents>
<document path="test_dir/file1.txt">
Contents of file1
</document>
<document path="test_dir/file2.txt">
Contents of file2
</document>
</documents>
"""
        actual = result.output
        assert expected.strip() == actual.strip()