Add --cxml flag (#16)

Refs #15

---------

Co-authored-by: Simon Willison <swillison@gmail.com>
This commit is contained in:
Lex Herbert
2024-09-08 22:19:59 -07:00
committed by GitHub
parent 4810ef7880
commit db4a164fec
4 changed files with 144 additions and 18 deletions

1
.gitignore vendored
View File

@@ -7,3 +7,4 @@ venv
.pytest_cache
*.egg-info
.DS_Store
build/

View File

@@ -12,6 +12,7 @@ For background on this project see [Building files-to-prompt entirely using Clau
## Installation
Install this tool using `pip`:
```bash
pip install files-to-prompt
```
@@ -29,11 +30,13 @@ This will output the contents of every file, with each file preceded by its rela
### Options
- `--include-hidden`: Include files and folders starting with `.` (hidden files and directories).
```bash
files-to-prompt path/to/directory --include-hidden
```
- `--ignore-gitignore`: Ignore `.gitignore` files and include all files.
```bash
files-to-prompt path/to/directory --ignore-gitignore
```
@@ -101,6 +104,25 @@ Contents of file3.txt
---
```
### XML Output
Anthropic has provided [specific guidelines](https://docs.anthropic.com/claude/docs/long-context-window-tips) for optimally structuring prompts to take advantage of Claude's extended context window.
To structure the output in this way, use the optional `--cxml` flag, which will produce output like this:
```xml
<documents>
<document path="my_directory/file1.txt">
Contents of file1.txt
</document>
<document path="my_directory/file2.txt">
Contents of file2.txt
</document>
...
</documents>
```
## Development
To contribute to this tool, first check out the code. Then create a new virtual environment:
@@ -118,6 +140,7 @@ pip install -e '.[test]'
```
To run the tests:
```bash
pytest
```

View File

@@ -1,7 +1,8 @@
import os
import click
from fnmatch import fnmatch
import click
def should_ignore(path, gitignore_rules):
for rule in gitignore_rules:
@@ -22,18 +23,39 @@ def read_gitignore(path):
return []
def print_path(path, content, xml):
    """Emit one file's path and content in the selected output format.

    Dispatches to ``print_as_xml`` when ``xml`` is truthy and to
    ``print_default`` otherwise; both receive ``path`` and ``content``
    unchanged.
    """
    emit = print_as_xml if xml else print_default
    emit(path, content)
def print_default(path, content):
    """Write one file in the plain-text layout.

    Echoes, in order: the path, a ``---`` separator, the content, a bare
    ``click.echo()`` call, and a closing ``---`` separator.
    """
    for line in (path, "---", content):
        click.echo(line)
    # Bare echo between the content and the closing separator.
    click.echo()
    click.echo("---")
def print_as_xml(path, content):
    """Write one file as a ``<document>`` element.

    The path is interpolated into the ``path`` attribute and the content is
    echoed between the opening and closing tags. Neither value is escaped
    here; both are written exactly as received.
    """
    opening_tag = f'<document path="{path}">'
    for chunk in (opening_tag, content, "</document>"):
        click.echo(chunk)
def process_path(
path, include_hidden, ignore_gitignore, gitignore_rules, ignore_patterns
path,
include_hidden,
ignore_gitignore,
gitignore_rules,
ignore_patterns,
claude_xml,
):
if os.path.isfile(path):
try:
with open(path, "r") as f:
file_contents = f.read()
click.echo(path)
click.echo("---")
click.echo(file_contents)
click.echo()
click.echo("---")
print_path(path, f.read(), claude_xml)
except UnicodeDecodeError:
warning_message = f"Warning: Skipping file {path} due to UnicodeDecodeError"
click.echo(click.style(warning_message, fg="red"), err=True)
@@ -63,17 +85,11 @@ def process_path(
if not any(fnmatch(f, pattern) for pattern in ignore_patterns)
]
for file in files:
for file in sorted(files):
file_path = os.path.join(root, file)
try:
with open(file_path, "r") as f:
file_contents = f.read()
click.echo(file_path)
click.echo("---")
click.echo(file_contents)
click.echo()
click.echo("---")
print_path(file_path, f.read(), claude_xml)
except UnicodeDecodeError:
warning_message = (
f"Warning: Skipping file {file_path} due to UnicodeDecodeError"
@@ -100,8 +116,15 @@ def process_path(
default=[],
help="List of patterns to ignore",
)
@click.option(
"claude_xml",
"-c",
"--cxml",
is_flag=True,
help="Output in XML-ish format suitable for Claude's long context window.",
)
@click.version_option()
def cli(paths, include_hidden, ignore_gitignore, ignore_patterns):
def cli(paths, include_hidden, ignore_gitignore, ignore_patterns, claude_xml):
"""
Takes one or more paths to files or directories and outputs every file,
recursively, each one preceded with its filename like this:
@@ -114,6 +137,19 @@ def cli(paths, include_hidden, ignore_gitignore, ignore_patterns):
path/to/file2.py
---
...
If the `--cxml` flag is provided, the output will be structured as follows:
<documents>
<document path="path/to/file1.txt">
Contents of file1.txt
</document>
<document path="path/to/file2.txt">
Contents of file2.txt
</document>
...
</documents>
"""
gitignore_rules = []
for path in paths:
@@ -121,6 +157,17 @@ def cli(paths, include_hidden, ignore_gitignore, ignore_patterns):
raise click.BadArgumentUsage(f"Path does not exist: {path}")
if not ignore_gitignore:
gitignore_rules.extend(read_gitignore(os.path.dirname(path)))
if claude_xml and path == paths[0]:
click.echo("<documents>")
process_path(
path, include_hidden, ignore_gitignore, gitignore_rules, ignore_patterns
path,
include_hidden,
ignore_gitignore,
gitignore_rules,
ignore_patterns,
claude_xml,
)
if claude_xml:
click.echo("</documents>")

View File

@@ -1,5 +1,7 @@
import os
from click.testing import CliRunner
from files_to_prompt.cli import cli
@@ -186,3 +188,56 @@ def test_binary_file_warning(tmpdir):
"Warning: Skipping file test_dir/binary_file.bin due to UnicodeDecodeError"
in stderr
)
def test_xml_format_dir(tmpdir):
    """``--cxml`` on a directory wraps every file in a ``<document>`` element
    inside a single ``<documents>`` root."""
    cli_runner = CliRunner()
    fixtures = (
        ("test_dir/file1.txt", "Contents of file1"),
        ("test_dir/file2.txt", "Contents of file2"),
    )
    expected = """
<documents>
<document path="test_dir/file1.txt">
Contents of file1
</document>
<document path="test_dir/file2.txt">
Contents of file2
</document>
</documents>
"""
    with tmpdir.as_cwd():
        os.makedirs("test_dir")
        for file_path, text in fixtures:
            with open(file_path, "w") as handle:
                handle.write(text)
        result = cli_runner.invoke(cli, ["test_dir", "--cxml"])
        assert result.exit_code == 0
        # Compare ignoring leading/trailing whitespace only.
        assert expected.strip() == result.output.strip()
def test_cxml_format_multiple_paths(tmpdir):
    """``--cxml`` with two explicit file paths still produces exactly one
    ``<documents>`` root containing both ``<document>`` elements."""
    cli_runner = CliRunner()
    fixtures = (
        ("test_dir/file1.txt", "Contents of file1"),
        ("test_dir/file2.txt", "Contents of file2"),
    )
    expected = """
<documents>
<document path="test_dir/file1.txt">
Contents of file1
</document>
<document path="test_dir/file2.txt">
Contents of file2
</document>
</documents>
"""
    with tmpdir.as_cwd():
        os.makedirs("test_dir")
        for file_path, text in fixtures:
            with open(file_path, "w") as handle:
                handle.write(text)
        cli_args = ["test_dir/file1.txt", "test_dir/file2.txt", "--cxml"]
        result = cli_runner.invoke(cli, cli_args)
        assert result.exit_code == 0
        # Compare ignoring leading/trailing whitespace only.
        assert expected.strip() == result.output.strip()