New gitignore implementation based on pathspec

Refs #40
This commit is contained in:
Simon Willison
2025-02-18 21:48:03 -08:00
parent 6164edf25c
commit e853b6df25
5 changed files with 141 additions and 42 deletions

View File

@@ -1,7 +1,8 @@
import os
import sys
from fnmatch import fnmatch
from files_to_prompt.utils import allowed_by_gitignore
import pathlib
import click
global_index = 1
@@ -24,25 +25,6 @@ EXT_TO_LANG = {
}
def should_ignore(path, gitignore_rules):
    """
    Report whether the last component of ``path`` matches any ignore rule.

    Each rule is treated as an ``fnmatch`` glob and compared against the
    basename of ``path`` only. When ``path`` is an existing directory, the
    basename with a trailing ``/`` is also tried so that rules written as
    ``name/`` can match.

    Parameters:
        path: Filesystem path to test.
        gitignore_rules: Iterable of glob pattern strings.

    Returns:
        bool: True if at least one rule matches, otherwise False.
    """
    name = os.path.basename(path)
    return any(
        fnmatch(name, rule)
        # The directory check is only reached when the plain match fails.
        or (os.path.isdir(path) and fnmatch(name + "/", rule))
        for rule in gitignore_rules
    )
def read_gitignore(path):
    """
    Load the ignore rules from the ``.gitignore`` file inside ``path``.

    Blank lines are dropped and every kept rule is returned with surrounding
    whitespace stripped. A line is treated as a comment only when its very
    first character is ``#``; the check is made on the unstripped line, so an
    indented ``#`` line is kept as a rule.

    Parameters:
        path: Directory expected to contain a ``.gitignore`` file.

    Returns:
        list: The rule strings, or an empty list when no ``.gitignore`` file
        exists in ``path``.
    """
    gitignore_path = os.path.join(path, ".gitignore")
    if not os.path.isfile(gitignore_path):
        return []

    rules = []
    with open(gitignore_path, "r") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped and not raw_line.startswith("#"):
                rules.append(stripped)
    return rules
def add_line_numbers(content):
lines = content.splitlines()
@@ -104,7 +86,6 @@ def process_path(
include_hidden,
ignore_files_only,
ignore_gitignore,
gitignore_rules,
ignore_patterns,
writer,
claude_xml,
@@ -124,17 +105,13 @@ def process_path(
dirs[:] = [d for d in dirs if not d.startswith(".")]
files = [f for f in files if not f.startswith(".")]
root_path = pathlib.Path(root)
if not ignore_gitignore:
gitignore_rules.extend(read_gitignore(root))
dirs[:] = [
d
for d in dirs
if not should_ignore(os.path.join(root, d), gitignore_rules)
d for d in dirs if allowed_by_gitignore(root_path, root_path / d)
]
files = [
f
for f in files
if not should_ignore(os.path.join(root, f), gitignore_rules)
f for f in files if allowed_by_gitignore(root_path, root_path / f)
]
if ignore_patterns:
@@ -302,7 +279,6 @@ def cli(
# Combine paths from arguments and stdin
paths = [*paths, *stdin_paths]
gitignore_rules = []
writer = click.echo
fp = None
if output_file:
@@ -311,8 +287,6 @@ def cli(
for path in paths:
if not os.path.exists(path):
raise click.BadArgumentUsage(f"Path does not exist: {path}")
if not ignore_gitignore:
gitignore_rules.extend(read_gitignore(os.path.dirname(path)))
if claude_xml and path == paths[0]:
writer("<documents>")
process_path(
@@ -321,7 +295,6 @@ def cli(
include_hidden,
ignore_files_only,
ignore_gitignore,
gitignore_rules,
ignore_patterns,
writer,
claude_xml,

75
files_to_prompt/utils.py Normal file
View File

@@ -0,0 +1,75 @@
from pathlib import Path
from pathspec.gitignore import GitIgnoreSpec
def allowed_by_gitignore(root: Path, file_path: Path) -> bool:
    """
    Check whether a path should be included (i.e. is not ignored) based on
    all .gitignore files encountered from the root directory down to the
    directory that contains the path.

    Rules from deeper .gitignore files are applied after rules from shallower
    ones, so the last matching rule decides the outcome.

    Parameters:
        root (Path): The root directory under which .gitignore files are searched.
        file_path (Path): The file or directory to be checked.

    Returns:
        bool: True if the path should be included (not ignored); False if it
        should be ignored.

    Raises:
        ValueError: If file_path does not resolve to a location under root.
    """
    # Local import keeps this block self-contained; warnings go to stderr so
    # they never mix with output the caller writes to stdout.
    import sys

    # Resolve absolute paths.
    abs_root = root.resolve()
    abs_file = file_path.resolve()

    # Ensure the path is under the provided root.
    try:
        abs_file.relative_to(abs_root)
    except ValueError as exc:
        raise ValueError(
            f"File {abs_file!r} is not under the root {abs_root!r}."
        ) from exc

    # Build the list of directories from the root to the path's parent.
    directories = [abs_root]
    for part in abs_file.parent.relative_to(abs_root).parts:
        directories.append(directories[-1] / part)

    # Directory-only patterns such as "build/" only match a path that ends
    # with a slash, so remember whether the target is a directory.
    is_dir = abs_file.is_dir()

    # The decision is updated by any matching .gitignore rule encountered.
    decision = None

    for directory in directories:
        gitignore_file = directory / ".gitignore"
        if not gitignore_file.is_file():
            continue

        try:
            text = gitignore_file.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable .gitignore is skipped, not fatal.
            print(f"Could not read {gitignore_file}: {e}", file=sys.stderr)
            continue

        # Keep non-blank lines only.
        lines = [line for line in text.splitlines() if line.strip()]

        # Compile a GitIgnoreSpec for the rules in the current directory.
        spec = GitIgnoreSpec.from_lines(lines)

        # .gitignore patterns are relative to the directory they are in, so
        # compute the path relative to this directory in POSIX format.
        rel_file = abs_file.relative_to(directory).as_posix()
        if is_dir:
            rel_file += "/"

        # If a rule from this .gitignore file applied, update the decision.
        result = spec.check_file(rel_file)
        if result.include is not None:
            decision = result.include

    # If no .gitignore rule matched, the path is included by default.
    if decision is None:
        return True

    # Interpretation:
    #   decision == True  --> a normal ignore rule matched (path is ignored)
    #   decision == False --> a negation rule matched (path is re-included)
    return not decision

View File

@@ -10,7 +10,8 @@ classifiers = [
"License :: OSI Approved :: Apache Software License"
]
dependencies = [
"click"
"click",
"pathspec",
]
[project.urls]

View File

@@ -0,0 +1,54 @@
from files_to_prompt.utils import allowed_by_gitignore
from pathlib import Path
def test_allowed_by_gitignore(tmpdir):
    """
    Check allowed_by_gitignore against a small repo that has a top-level
    .gitignore ("build/") and a nested src/.gitignore ("temp.txt").
    """
    # Lay out the directory skeleton.
    repo = Path(tmpdir) / "repo"
    repo.mkdir()
    build_dir = repo / "build"
    build_dir.mkdir()
    src_dir = repo / "src"
    src_dir.mkdir()

    # The top-level rule ignores the build directory; the nested rule
    # ignores a single file inside src.
    (repo / ".gitignore").write_text("build/\n", encoding="utf-8")
    (src_dir / ".gitignore").write_text("temp.txt\n", encoding="utf-8")

    # Populate the tree with the files under test.
    output_file = build_dir / "output.txt"
    main_file = src_dir / "main.py"
    temp_file = src_dir / "temp.txt"
    keep_file = src_dir / "keep.txt"
    root_file = repo / "README.md"
    file_contents = (
        (output_file, "dummy build output"),
        (main_file, "print('Hello')"),
        (temp_file, "should be ignored"),
        (keep_file, "keep this file"),
        (root_file, "# Repo README"),
    )
    for target, text in file_contents:
        target.write_text(text, encoding="utf-8")

    # Each case: (path, expected result, failure message).
    cases = (
        # 1. File in "build" should be ignored.
        (output_file, False, "build/output.txt should be ignored"),
        # 2. File in "src" that is ignored per src/.gitignore.
        (temp_file, False, "src/temp.txt should be ignored"),
        # 3. Files in "src" not mentioned in .gitignore should be included.
        (main_file, True, "src/main.py should be included"),
        (keep_file, True, "src/keep.txt should be included"),
        # 4. File at the repo root not mentioned in .gitignore.
        (root_file, True, "repo/README.md should be included"),
    )
    for candidate, expected, message in cases:
        assert allowed_by_gitignore(repo, candidate) is expected, message

View File

@@ -21,7 +21,7 @@ def test_basic_functionality(tmpdir):
with open("test_dir/file2.txt", "w") as f:
f.write("Contents of file2")
result = runner.invoke(cli, ["test_dir"])
result = runner.invoke(cli, ["test_dir"], catch_exceptions=False)
assert result.exit_code == 0
assert "test_dir/file1.txt" in result.output
assert "Contents of file1" in result.output
@@ -36,7 +36,7 @@ def test_include_hidden(tmpdir):
with open("test_dir/.hidden.txt", "w") as f:
f.write("Contents of hidden file")
result = runner.invoke(cli, ["test_dir"])
result = runner.invoke(cli, ["test_dir"], catch_exceptions=False)
assert result.exit_code == 0
assert "test_dir/.hidden.txt" not in result.output
@@ -61,11 +61,9 @@ def test_ignore_gitignore(tmpdir):
with open("test_dir/nested_include/included2.txt", "w") as f:
f.write("This nested file should be included")
with open("test_dir/nested_ignore/.gitignore", "w") as f:
f.write("nested_ignore.txt")
f.write("*")
with open("test_dir/nested_ignore/nested_ignore.txt", "w") as f:
f.write("This nested file should not be included")
with open("test_dir/nested_ignore/actually_include.txt", "w") as f:
f.write("This nested file should actually be included")
result = runner.invoke(cli, ["test_dir", "-c"])
assert result.exit_code == 0
@@ -74,7 +72,6 @@ def test_ignore_gitignore(tmpdir):
assert filenames == {
"test_dir/included.txt",
"test_dir/nested_include/included2.txt",
"test_dir/nested_ignore/actually_include.txt",
}
result2 = runner.invoke(cli, ["test_dir", "-c", "--ignore-gitignore"])
@@ -86,7 +83,6 @@ def test_ignore_gitignore(tmpdir):
"test_dir/ignored.txt",
"test_dir/nested_include/included2.txt",
"test_dir/nested_ignore/nested_ignore.txt",
"test_dir/nested_ignore/actually_include.txt",
}
@@ -243,7 +239,7 @@ def test_binary_file_warning(tmpdir):
with open("test_dir/text_file.txt", "w") as f:
f.write("This is a text file")
result = runner.invoke(cli, ["test_dir"])
result = runner.invoke(cli, ["test_dir"], catch_exceptions=False)
assert result.exit_code == 0
stdout = result.stdout
@@ -331,7 +327,7 @@ def test_line_numbers(tmpdir):
with open("test_dir/multiline.txt", "w") as f:
f.write(test_content)
result = runner.invoke(cli, ["test_dir"])
result = runner.invoke(cli, ["test_dir"], catch_exceptions=False)
assert result.exit_code == 0
assert "1 First line" not in result.output
assert test_content in result.output