Git diffs (#25)

* git diffs

* cmdline arg for git diff

* git diff validation

* change structure

* add git diff to model

* nits

* revert webhook

---------

Co-authored-by: Kartik Sarangmath <kartiksarangmath@Kartiks-MacBook-Air.local>
This commit is contained in:
ksarangmath
2025-07-21 23:38:38 -07:00
committed by GitHub
parent e5cc84202e
commit fbb4339a78
12 changed files with 336 additions and 1 deletions

View File

@@ -0,0 +1,31 @@
"""Add git_diff field to agent_instances
Revision ID: 4e77ec2a7faa
Revises: 84cd4a8c9a18
Create Date: 2025-07-21 12:44:28.861822
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration it revises (its parent in the migration history).
revision: str = "4e77ec2a7faa"
down_revision: Union[str, None] = "84cd4a8c9a18"
# No branch labels or dependencies are declared for this migration.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add the nullable ``git_diff`` text column to ``agent_instances``."""
    # Originally auto-generated by Alembic; the column is nullable, so
    # existing rows need no backfill.
    git_diff_column = sa.Column("git_diff", sa.Text(), nullable=True)
    op.add_column("agent_instances", git_diff_column)
def downgrade() -> None:
    """Drop the ``git_diff`` column from ``agent_instances``."""
    # Originally auto-generated by Alembic; reverses upgrade().
    table_name, column_name = "agent_instances", "git_diff"
    op.drop_column(table_name, column_name)

View File

@@ -9,9 +9,11 @@ from sqlalchemy.orm import (
Mapped, # type: ignore[attr-defined]
mapped_column, # type: ignore[attr-defined]
relationship,
validates,
)
from .enums import AgentStatus
from .utils import is_valid_git_diff
if TYPE_CHECKING:
from .subscription_models import (
@@ -119,6 +121,7 @@ class AgentInstance(Base):
status: Mapped[AgentStatus] = mapped_column(default=AgentStatus.ACTIVE)
started_at: Mapped[datetime] = mapped_column(default=lambda: datetime.now(UTC))
ended_at: Mapped[datetime | None] = mapped_column(default=None)
git_diff: Mapped[str | None] = mapped_column(Text, default=None)
# Relationships
user_agent: Mapped["UserAgent"] = relationship(
@@ -137,6 +140,20 @@ class AgentInstance(Base):
order_by="AgentUserFeedback.created_at",
)
@validates("git_diff")
def validate_git_diff(self, key: str, value: str | None) -> str | None:
    """Validate a value before it is assigned to ``git_diff``.

    ``None`` (no diff recorded) and the empty string are accepted as-is.
    The empty string is what ``sanitize_git_diff`` in ``.utils`` returns to
    clear a stored diff, so rejecting it here would make that documented
    case impossible to persist. Any other value must pass
    ``is_valid_git_diff``.

    Args:
        key: Name of the attribute being set (always ``"git_diff"``).
        value: The proposed new value.

    Returns:
        The value unchanged when it is acceptable.

    Raises:
        ValueError: If the value is a non-empty string that is not a
            valid unified git diff.
    """
    # "No diff" and "cleared diff" carry no content to validate.
    if value is None or value == "":
        return value

    if not is_valid_git_diff(value):
        raise ValueError("Invalid git diff format. Must be a valid unified diff.")

    return value
class AgentStep(Base):
__tablename__ = "agent_steps"

88
shared/database/utils.py Normal file
View File

@@ -0,0 +1,88 @@
"""Database utility functions."""
import re
from typing import Optional
def is_valid_git_diff(diff: Optional[str]) -> bool:
    """Return whether *diff* looks like a unified git diff.

    This is a structural sniff test, not a parser: each marker only has to
    appear at the start of some line, anywhere in the text. All of the
    following are required:

    1. a ``diff --git`` header,
    2. an ``index <hash>..<hash>`` line or a ``new file mode`` line
       (new/untracked files may lack an index line),
    3. both ``--- `` and ``+++ `` file markers,
    4. at least one ``@@ ... @@`` hunk header,
    5. at least one line starting with a space, ``+`` or ``-``.

    Args:
        diff: The candidate text to check.

    Returns:
        True if every required marker is present; False otherwise,
        including for ``None``, empty strings and non-string values.
    """
    if not (diff and isinstance(diff, str)):
        return False

    def line_starts_with(pattern: str) -> bool:
        # Every pattern is anchored with ^ and matched per line.
        return re.search(pattern, diff, re.MULTILINE) is not None

    if not line_starts_with(r"^diff --git"):
        return False

    # Either an index line or a "new file mode" line identifies the blobs.
    if not (
        line_starts_with(r"^index [a-f0-9]+\.\.[a-f0-9]+")
        or line_starts_with(r"^new file mode")
    ):
        return False

    # Both the old-file and new-file markers must be present.
    if not (line_starts_with(r"^--- ") and line_starts_with(r"^\+\+\+ ")):
        return False

    if not line_starts_with(r"^@@[ \-\+,0-9]+@@"):
        return False

    # NOTE(review): the "--- " marker required above already starts with
    # "-", so this final check cannot fail at this point; it is kept to
    # preserve the original sequence of checks.
    return line_starts_with(r"^[ \+\-]")
def sanitize_git_diff(diff: Optional[str]) -> Optional[str]:
    """Sanitize and validate a git diff for storage.

    Whitespace inside a unified diff is significant: a blank context line
    is a single space, and changed lines may end in spaces or tabs. Only
    leading whitespace and trailing line terminators are therefore removed;
    a full ``strip()`` could drop trailing context lines or alter the last
    changed line.

    Args:
        diff: The git diff string to sanitize (None means no update needed).

    Returns:
        - The cleaned diff string if it is a valid git diff, truncated with
          a marker if it exceeds the size limit.
        - Empty string if the input is empty or whitespace-only (clears the
          git diff).
        - None if the input is None, not a string, or not a valid git diff.
    """
    if diff is None:
        return None

    # Non-string input can never be a valid diff; report it as invalid
    # rather than failing on a missing str method.
    if not isinstance(diff, str):
        return None

    # Empty or whitespace-only input is the valid "clear the diff" case.
    if not diff.strip():
        return ""

    # Trim surrounding noise without touching whitespace that belongs to
    # the diff body.
    diff = diff.lstrip().rstrip("\r\n")

    # Check if it's a valid git diff
    if not is_valid_git_diff(diff):
        return None

    # Limit size to prevent abuse (1MB, measured in characters)
    max_size = 1024 * 1024
    if len(diff) > max_size:
        # Leave headroom for the marker so the result stays under the limit.
        diff = diff[: max_size - 100] + "\n\n... [TRUNCATED - DIFF TOO LARGE] ..."

    return diff