From b1655e6e9fa187b75e4c4836d0b697afc4b02ca8 Mon Sep 17 00:00:00 2001 From: Max Parke Date: Mon, 29 Jun 2026 21:01:37 -0400 Subject: [PATCH 1/3] feat(lib): capture client-attested build provenance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add agentex.lib.utils.build_provenance — the single producer of source identity for agent builds (git coordinates + a deterministic content hash of the build context). prepare_cloud_build_context now writes build-info.json into the staged context (populates runtime registration_metadata with no server change) and exposes provenance on CloudBuildContext so the upload can send source_* fields. Archive member order is now deterministic via a sorted enumeration shared with the hash. The hash is computed only when there is no clean commit to identify the build (dirty tree or non-git context). First of three surfaces for AGX1-418 (Phase 1, client-attested); the SGP build-record columns and the sgpctl/Gitea uploaders follow. 
Co-Authored-By: Claude Opus 4.8 --- .../lib/cli/handlers/agent_handlers.py | 24 +- src/agentex/lib/sdk/config/agent_manifest.py | 10 +- src/agentex/lib/utils/build_provenance.py | 224 ++++++++++++++++ tests/lib/cli/test_agent_handlers.py | 19 ++ tests/lib/test_build_provenance.py | 242 ++++++++++++++++++ 5 files changed, 513 insertions(+), 6 deletions(-) create mode 100644 src/agentex/lib/utils/build_provenance.py create mode 100644 tests/lib/test_build_provenance.py diff --git a/src/agentex/lib/cli/handlers/agent_handlers.py b/src/agentex/lib/cli/handlers/agent_handlers.py index 1f2ccc7ef..a224397ba 100644 --- a/src/agentex/lib/cli/handlers/agent_handlers.py +++ b/src/agentex/lib/cli/handlers/agent_handlers.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json from typing import NamedTuple from pathlib import Path @@ -8,12 +9,15 @@ from agentex.lib.cli.debug import DebugConfig from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.build_provenance import BuildProvenance, capture_build_provenance from agentex.lib.cli.handlers.run_handlers import RunError, run_agent as _run_agent from agentex.lib.sdk.config.agent_manifest import BuildContextManager, load_agent_manifest, build_context_manager logger = make_logger(__name__) console = Console() +_BUILD_INFO_FILENAME = "build-info.json" + class DockerBuildError(Exception): """An error occurred during docker build""" @@ -28,6 +32,7 @@ class CloudBuildContext(NamedTuple): tag: str image_name: str build_context_size_kb: float + provenance: BuildProvenance def build_agent( @@ -261,8 +266,24 @@ def prepare_cloud_build_context( logger.info("Preparing build context...") with build_context_manager(agent_manifest, build_context_root) as build_context: + staged_root = Path(build_context.path) + # Capture source identity over the staged (post-.dockerignore) tree — the + # exact bytes that ship — then write build-info.json into it so it lands + # in the image for runtime registration. 
Capture runs before the write so + # the content hash never includes build-info.json itself. + provenance = capture_build_provenance( + repo_path=build_context_root, + context_root=build_context_root, + content_root=staged_root, + ) + (staged_root / _BUILD_INFO_FILENAME).write_text(json.dumps(provenance.build_info(), indent=2, sort_keys=True)) + logger.info( + f"Build provenance: commit={provenance.commit} ref={provenance.ref} " + f"clean_commit={provenance.is_clean_commit}" + ) + # Compress the prepared context using the static zipped method - with BuildContextManager.zipped(root_path=build_context.path) as archive_buffer: + with BuildContextManager.zipped(root_path=staged_root) as archive_buffer: archive_bytes = archive_buffer.read() build_context_size_kb = len(archive_bytes) / 1024 @@ -275,4 +296,5 @@ def prepare_cloud_build_context( tag=tag, image_name=image_name, build_context_size_kb=build_context_size_kb, + provenance=provenance, ) diff --git a/src/agentex/lib/sdk/config/agent_manifest.py b/src/agentex/lib/sdk/config/agent_manifest.py index fd743e635..c2fe03052 100644 --- a/src/agentex/lib/sdk/config/agent_manifest.py +++ b/src/agentex/lib/sdk/config/agent_manifest.py @@ -24,6 +24,7 @@ from agentex.lib.utils.io import load_yaml_file from agentex.lib.utils.logging import make_logger from agentex.config.agent_manifest import AgentManifest # noqa: F401 +from agentex.lib.utils.build_provenance import iter_context_files logger = make_logger(__name__) @@ -189,12 +190,11 @@ def zipped(root_path: Path | None = None) -> Iterator[IO[bytes]]: tar_buffer = io.BytesIO() + # Sorted, relpath-stable enumeration (shared with the content hash) so the + # archive's member order is deterministic across machines. 
with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar_file: - for path in Path(root_path).rglob( - "*" - ): # Recursively add files to the tar.gz - if path.is_file(): # Ensure that we're only adding files - tar_file.add(path, arcname=path.relative_to(root_path)) + for path in iter_context_files(Path(root_path)): + tar_file.add(path, arcname=path.relative_to(root_path)) tar_buffer.seek(0) # Reset the buffer position to the beginning yield tar_buffer diff --git a/src/agentex/lib/utils/build_provenance.py b/src/agentex/lib/utils/build_provenance.py new file mode 100644 index 000000000..554d5bdd2 --- /dev/null +++ b/src/agentex/lib/utils/build_provenance.py @@ -0,0 +1,224 @@ +"""Client-attested build provenance capture (AGX1-418). + +The single producer of source identity for agent builds: git coordinates plus a +deterministic content hash of the build context. Every build path (CLI, sgpctl, +CI) imports this so capture logic and the ``working_tree_hash`` definition live +in exactly one place. Capture is best-effort — a missing/odd git state degrades +to nulls and never raises into a build. +""" + +from __future__ import annotations + +import os +import stat +import hashlib +import subprocess +from typing import Optional +from pathlib import Path +from datetime import datetime, timezone +from dataclasses import dataclass + +from agentex.lib.utils.logging import make_logger + +logger = make_logger(__name__) + +_GIT_TIMEOUT_S = 5 +_HASH_CHUNK_BYTES = 1 << 20 + + +@dataclass(frozen=True) +class BuildProvenance: + """Source identity for one build. All fields degrade to ``None``. + + Exactly one identity anchors the build: a **clean committed tree** keys on + ``commit`` (``working_tree_hash`` is ``None``); anything else — a dirty tree + or a non-git context, neither of which a commit can address — carries a + ``working_tree_hash`` instead. So a non-null hash means "no clean commit to + point to," and ``is_clean_commit`` is the gate ``--require-clean`` checks. 
+ """ + + repo: Optional[str] = None + commit: Optional[str] = None + ref: Optional[str] = None + subpath: Optional[str] = None + working_tree_hash: Optional[str] = None + author_name: Optional[str] = None + author_email: Optional[str] = None + build_timestamp: Optional[str] = None + + @property + def is_clean_commit(self) -> bool: + return self.commit is not None and self.working_tree_hash is None + + def source_fields(self) -> dict[str, str]: + """The ``source_*`` form fields for the cloud-build upload (None omitted).""" + fields = { + "source_repo": self.repo, + "source_commit": self.commit, + "source_ref": self.ref, + "source_subpath": self.subpath, + "working_tree_hash": self.working_tree_hash, + } + return {key: value for key, value in fields.items() if value is not None} + + def build_info(self) -> dict[str, str]: + """The ``build-info.json`` payload (runtime ``registration_metadata``). + + Overlapping keys match the server's ``DeploymentHistory`` type + (``commit_hash`` / ``branch_name`` / ``author_*`` / ``build_timestamp``), + which is populated from ``registration_metadata``; the rest are the + provenance-specific coordinates. 
+ """ + info = { + "repo": self.repo, + "commit_hash": self.commit, + "branch_name": self.ref, + "subpath": self.subpath, + "working_tree_hash": self.working_tree_hash, + "author_name": self.author_name, + "author_email": self.author_email, + "build_timestamp": self.build_timestamp, + } + return {key: value for key, value in info.items() if value is not None} + + +def _git(repo_root: Path, *args: str) -> Optional[str]: + """Run a git command under ``repo_root``; return stripped stdout or None.""" + try: + proc = subprocess.run( + ("git", "-C", str(repo_root), *args), + capture_output=True, + text=True, + timeout=_GIT_TIMEOUT_S, + check=False, + ) + except (OSError, subprocess.SubprocessError): + return None + if proc.returncode != 0: + return None + return proc.stdout.strip() or None + + +def normalize_remote(url: Optional[str]) -> Optional[str]: + """Canonicalize a git remote to ``host/path`` — credentials and scheme stripped. + + ``git@github.com:org/repo.git`` and ``https://x:tok@github.com/org/repo.git`` + both normalize to ``github.com/org/repo``. Host is lowercased; path casing is + preserved (repo paths can be case-significant). 
+ """ + if not url: + return None + candidate = url.strip() + # scp-like syntax: git@host:org/repo(.git) — no scheme, host/path split on ':' + if "://" not in candidate and ":" in candidate and "/" not in candidate.split(":", 1)[0]: + candidate = candidate.split("@", 1)[-1].replace(":", "/", 1) + else: + if "://" in candidate: + candidate = candidate.split("://", 1)[1] + candidate = candidate.split("@", 1)[-1] + if candidate.endswith(".git"): + candidate = candidate[: -len(".git")] + candidate = candidate.strip("/") + if not candidate: + return None + host, slash, path = candidate.partition("/") + return f"{host.lower()}{slash}{path}" + + +def _sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with open(path, "rb") as handle: + while chunk := handle.read(_HASH_CHUNK_BYTES): + digest.update(chunk) + return digest.hexdigest() + + +def iter_context_files(root: Path) -> list[Path]: + """Files (and symlinks) under ``root``, sorted by POSIX relpath. + + The canonical, order-stable enumeration shared by the content hash and the + archive packer so the two can never drift on which files they cover. + """ + return sorted( + (path for path in root.rglob("*") if path.is_symlink() or path.is_file()), + key=lambda path: path.relative_to(root).as_posix(), + ) + + +def working_tree_hash(root: Path) -> str: + """Deterministic content hash of the build context at ``root``. + + sha256 over the sorted ``(relpath, normalized mode, content digest)`` of every + file — the build *inputs*, not the tarball (tar/gzip framing is + non-deterministic and would defeat dedupe). Mode is normalized to the + executable bit; symlinks hash their target string, not the resolved content. 
+ """ + lines: list[str] = [] + for path in iter_context_files(root): + relpath = path.relative_to(root).as_posix() + if path.is_symlink(): + mode = "120000" + content_digest = hashlib.sha256(os.readlink(path).encode("utf-8")).hexdigest() + else: + executable = bool(path.stat().st_mode & stat.S_IXUSR) + mode = "100755" if executable else "100644" + content_digest = _sha256_file(path) + lines.append(f"{relpath}\x00{mode}\x00{content_digest}") + return hashlib.sha256("\n".join(lines).encode("utf-8")).hexdigest() + + +def capture_build_provenance( + repo_path: Path, context_root: Path, content_root: Optional[Path] = None +) -> BuildProvenance: + """Capture source identity for a build of ``context_root``. + + ``repo_path`` is where git is interrogated and ``subpath`` is ``context_root`` + relative to the repo root (which agent, in a monorepo). ``content_root`` is + the directory hashed — the *staged*, post-``.dockerignore`` tree that actually + ships; it defaults to ``context_root`` when there is no separate staging dir. + The content hash is computed unless a clean commit identifies the build (so: + for a dirty tree or a non-git context, but not for a clean committed tree). + """ + timestamp = datetime.now(timezone.utc).isoformat() + hash_root = content_root if content_root is not None else context_root + repo_root = _git(repo_path, "rev-parse", "--show-toplevel") + if repo_root is None: + # No git at all — the content hash is the only identity available. + logger.info("build-provenance: %s is not a git work tree; hashing context", repo_path) + return BuildProvenance( + working_tree_hash=working_tree_hash(hash_root), + build_timestamp=timestamp, + ) + + repo_root_path = Path(repo_root) + commit = _git(repo_root_path, "rev-parse", "HEAD") + # symbolic-ref fails on a detached HEAD (→ None); fall back to an exact tag. 
+ ref = _git(repo_root_path, "symbolic-ref", "--short", "HEAD") or _git( + repo_root_path, "describe", "--tags", "--exact-match" + ) + remote = normalize_remote(_git(repo_root_path, "remote", "get-url", "origin")) + author_name = _git(repo_root_path, "log", "-1", "--format=%an") + author_email = _git(repo_root_path, "log", "-1", "--format=%ae") + + subpath: Optional[str] = None + try: + relative = context_root.resolve().relative_to(repo_root_path.resolve()).as_posix() + subpath = relative if relative != "." else None + except ValueError: + subpath = None + + # Hash unless a clean commit identifies the build: dirty tree, or an unborn + # HEAD with no commit yet, both fall back to the content hash. + dirty = _git(repo_root_path, "status", "--porcelain") is not None + tree_hash = working_tree_hash(hash_root) if (dirty or commit is None) else None + + return BuildProvenance( + repo=remote, + commit=commit, + ref=ref, + subpath=subpath, + working_tree_hash=tree_hash, + author_name=author_name, + author_email=author_email, + build_timestamp=timestamp, + ) diff --git a/tests/lib/cli/test_agent_handlers.py b/tests/lib/cli/test_agent_handlers.py index 73c29cfbb..768f6a19c 100644 --- a/tests/lib/cli/test_agent_handlers.py +++ b/tests/lib/cli/test_agent_handlers.py @@ -2,7 +2,9 @@ from __future__ import annotations +import io import os +import json import tarfile import tempfile from pathlib import Path @@ -145,6 +147,23 @@ def test_prepare_cloud_build_context_returns_cloud_build_context( assert len(result.archive_bytes) > 0 assert result.build_context_size_kb > 0 + def test_prepare_cloud_build_context_writes_build_info(self, temp_agent_dir: Path): + """build-info.json ships in the archive and matches the captured provenance.""" + manifest_path = str(temp_agent_dir / "manifest.yaml") + + result = prepare_cloud_build_context(manifest_path=manifest_path) + + # Non-git temp dir → the content hash is the identity, no commit. 
+ assert result.provenance.commit is None + assert result.provenance.working_tree_hash is not None + + with tarfile.open(fileobj=io.BytesIO(result.archive_bytes), mode="r:gz") as archive: + build_info_name = next(n for n in archive.getnames() if n.endswith("build-info.json")) + member = archive.extractfile(build_info_name) + assert member is not None + shipped = json.loads(member.read()) + assert shipped == result.provenance.build_info() + def test_prepare_cloud_build_context_with_tag_override(self, temp_agent_dir: Path): """Test that tag parameter overrides manifest tag.""" manifest_path = str(temp_agent_dir / "manifest.yaml") diff --git a/tests/lib/test_build_provenance.py b/tests/lib/test_build_provenance.py new file mode 100644 index 000000000..2a765276c --- /dev/null +++ b/tests/lib/test_build_provenance.py @@ -0,0 +1,242 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest + +from agentex.lib.utils.build_provenance import ( + normalize_remote, + working_tree_hash, + iter_context_files, + capture_build_provenance, +) + + +def _git(repo: Path, *args: str) -> None: + subprocess.run(("git", "-C", str(repo), *args), check=True, capture_output=True, text=True) + + +def _init_repo(path: Path, *, remote: str | None = "git@github.com:scaleapi/demo.git") -> Path: + path.mkdir(parents=True, exist_ok=True) + _git(path, "init", "-q") + _git(path, "config", "user.email", "dev@scale.com") + _git(path, "config", "user.name", "Dev") + _git(path, "config", "commit.gpgsign", "false") + if remote: + _git(path, "remote", "add", "origin", remote) + return path + + +def _commit_all(path: Path, message: str = "init") -> None: + _git(path, "add", "-A") + _git(path, "commit", "-q", "-m", message) + _git(path, "branch", "-M", "main") + + +def _write(root: Path, rel: str, content: str = "x") -> None: + target = root / rel + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content) + + +# --- normalize_remote 
--------------------------------------------------------- + + +@pytest.mark.parametrize( + ("raw", "expected"), + [ + ("git@github.com:scaleapi/Repo.git", "github.com/scaleapi/Repo"), + ("https://github.com/scaleapi/Repo.git", "github.com/scaleapi/Repo"), + ("https://x-token:secret@GitHub.com/scaleapi/Repo", "github.com/scaleapi/Repo"), + ("ssh://git@gitlab.com/group/sub/proj.git", "gitlab.com/group/sub/proj"), + ("", None), + (None, None), + ], +) +def test_normalize_remote(raw: str | None, expected: str | None) -> None: + assert normalize_remote(raw) == expected + + +# --- working_tree_hash -------------------------------------------------------- + + +def test_hash_is_order_independent(tmp_path: Path) -> None: + first = tmp_path / "a" + second = tmp_path / "b" + for rel in ("z.txt", "a/b.txt", "m.txt"): + _write(first, rel, rel) + # Same content, different creation order. + for rel in ("m.txt", "z.txt", "a/b.txt"): + _write(second, rel, rel) + assert working_tree_hash(first) == working_tree_hash(second) + + +def test_hash_changes_on_one_byte(tmp_path: Path) -> None: + root = tmp_path / "ctx" + _write(root, "f.txt", "hello") + before = working_tree_hash(root) + _write(root, "f.txt", "hellp") + assert working_tree_hash(root) != before + + +def test_hash_changes_when_file_added(tmp_path: Path) -> None: + root = tmp_path / "ctx" + _write(root, "f.txt", "hello") + before = working_tree_hash(root) + _write(root, "g.txt", "new") + assert working_tree_hash(root) != before + + +def test_hash_changes_on_executable_bit(tmp_path: Path) -> None: + root = tmp_path / "ctx" + script = root / "run.sh" + _write(root, "run.sh", "#!/bin/sh\n") + before = working_tree_hash(root) + script.chmod(0o755) + assert working_tree_hash(root) != before + + +def test_symlink_hashes_target_not_resolved_content(tmp_path: Path) -> None: + root = tmp_path / "ctx" + root.mkdir() + # Dangling symlinks: distinct hashes prove the target string is hashed, not + # resolved content (resolving would 
raise). + (root / "link").symlink_to("points/to/a") + hash_a = working_tree_hash(root) + (root / "link").unlink() + (root / "link").symlink_to("points/to/b") + assert working_tree_hash(root) != hash_a + + +def test_iter_context_files_skips_directories(tmp_path: Path) -> None: + root = tmp_path / "ctx" + _write(root, "pkg/mod.py", "x") + _write(root, "top.txt", "y") + rels = [path.relative_to(root).as_posix() for path in iter_context_files(root)] + assert rels == ["pkg/mod.py", "top.txt"] + + +# --- capture_build_provenance ------------------------------------------------- + + +def test_capture_clean_tree(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + _write(repo, "main.py", "print(1)") + _commit_all(repo) + + prov = capture_build_provenance(repo, repo) + + assert prov.repo == "github.com/scaleapi/demo" + assert prov.ref == "main" + assert prov.commit is not None and len(prov.commit) == 40 + assert prov.working_tree_hash is None + assert prov.is_clean_commit is True + assert prov.subpath is None + assert prov.author_email == "dev@scale.com" + + +def test_capture_dirty_tracked_modification(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + _write(repo, "main.py", "print(1)") + _commit_all(repo) + _write(repo, "main.py", "print(2)") # modify tracked, do not commit + + prov = capture_build_provenance(repo, repo) + + assert prov.is_clean_commit is False + assert prov.working_tree_hash is not None + assert prov.commit is not None # commit still recorded alongside the hash + + +def test_capture_dirty_untracked_file_changes_hash(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + _write(repo, "main.py", "print(1)") + _commit_all(repo) + _write(repo, "scratch.py", "debug = True") # untracked + + prov = capture_build_provenance(repo, repo) + + # The stale-code guard: an untracked file is part of the build context, so it + # must move the hash (a `git diff` of tracked files alone would miss it). 
+ assert prov.is_clean_commit is False + assert prov.working_tree_hash == working_tree_hash(repo) + assert working_tree_hash(repo) != _hash_without(repo, "scratch.py") + + +def _hash_without(repo: Path, rel: str) -> str: + removed = repo / rel + saved = removed.read_text() + removed.unlink() + try: + return working_tree_hash(repo) + finally: + removed.write_text(saved) + + +def test_capture_detached_head_has_no_ref(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + _write(repo, "main.py", "print(1)") + _commit_all(repo) + _write(repo, "main.py", "print(2)") + _git(repo, "add", "-A") + _git(repo, "commit", "-q", "-m", "second") + first = subprocess.run( + ("git", "-C", str(repo), "rev-list", "--max-parents=0", "HEAD"), + check=True, + capture_output=True, + text=True, + ).stdout.strip() + _git(repo, "checkout", "-q", first) + + prov = capture_build_provenance(repo, repo) + + assert prov.commit == first + assert prov.ref is None + + +def test_capture_detached_on_tag_uses_tag(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + _write(repo, "main.py", "print(1)") + _commit_all(repo) + _git(repo, "tag", "v1.2.3") + _git(repo, "checkout", "-q", "v1.2.3") + + assert capture_build_provenance(repo, repo).ref == "v1.2.3" + + +def test_capture_no_remote(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo", remote=None) + _write(repo, "main.py", "print(1)") + _commit_all(repo) + + prov = capture_build_provenance(repo, repo) + + assert prov.repo is None + assert prov.commit is not None + assert prov.is_clean_commit is True # no remote, but a clean commit still anchors it + + +def test_capture_non_git_dir(tmp_path: Path) -> None: + plain = tmp_path / "plain" + _write(plain, "main.py", "print(1)") + + prov = capture_build_provenance(plain, plain) + + assert prov.repo is None + assert prov.commit is None + assert prov.ref is None + # No commit to point to → the content hash is the identity. 
+ assert prov.working_tree_hash == working_tree_hash(plain) + assert prov.is_clean_commit is False + assert prov.build_timestamp is not None + + +def test_capture_monorepo_subpath(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + _write(repo, "agents/foo/main.py", "print(1)") + _commit_all(repo) + + prov = capture_build_provenance(repo, repo / "agents" / "foo") + + assert prov.subpath == "agents/foo" From cf9994db079318e4b6919cedafe857ff0bd436e9 Mon Sep 17 00:00:00 2001 From: Max Parke Date: Mon, 29 Jun 2026 21:47:07 -0400 Subject: [PATCH 2/3] fix(lib): always hash build context; record dirty flag (Greptile) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Greptile review on the build-provenance capture util: - Always compute working_tree_hash (drop the "skip on clean commit" path). A `git status` clean tree can still contain .gitignore'd-but-not- .dockerignore'd files the commit can't reproduce; an always-present content hash identifies the exact shipped bytes and closes that gap. - Guard the hash (_safe_working_tree_hash) so a permission error or filesystem race degrades to None instead of aborting the build — the module contract is that capture never raises into a build. - Record dirtiness as a first-class `dirty` flag (surfaced as `source_dirty` / `dirty`) rather than overloading hash-presence, matching Go's vcs.modified and Nix's dirtyRev. None outside a git work tree. 
Co-Authored-By: Claude Opus 4.8 --- .../lib/cli/handlers/agent_handlers.py | 2 +- src/agentex/lib/utils/build_provenance.py | 56 +++++++++++-------- tests/lib/test_build_provenance.py | 43 +++++++------- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/src/agentex/lib/cli/handlers/agent_handlers.py b/src/agentex/lib/cli/handlers/agent_handlers.py index a224397ba..d390e967f 100644 --- a/src/agentex/lib/cli/handlers/agent_handlers.py +++ b/src/agentex/lib/cli/handlers/agent_handlers.py @@ -279,7 +279,7 @@ def prepare_cloud_build_context( (staged_root / _BUILD_INFO_FILENAME).write_text(json.dumps(provenance.build_info(), indent=2, sort_keys=True)) logger.info( f"Build provenance: commit={provenance.commit} ref={provenance.ref} " - f"clean_commit={provenance.is_clean_commit}" + f"working_tree_hash={provenance.working_tree_hash}" ) # Compress the prepared context using the static zipped method diff --git a/src/agentex/lib/utils/build_provenance.py b/src/agentex/lib/utils/build_provenance.py index 554d5bdd2..af17b03c9 100644 --- a/src/agentex/lib/utils/build_provenance.py +++ b/src/agentex/lib/utils/build_provenance.py @@ -28,13 +28,13 @@ @dataclass(frozen=True) class BuildProvenance: - """Source identity for one build. All fields degrade to ``None``. + """Source identity for one build; every field degrades to ``None``. - Exactly one identity anchors the build: a **clean committed tree** keys on - ``commit`` (``working_tree_hash`` is ``None``); anything else — a dirty tree - or a non-git context, neither of which a commit can address — carries a - ``working_tree_hash`` instead. So a non-null hash means "no clean commit to - point to," and ``is_clean_commit`` is the gate ``--require-clean`` checks. + ``working_tree_hash`` is the deterministic content hash of the build context + and is always computed — it identifies the exact bytes that shipped, + independent of git state. 
``commit`` (+ ``ref`` / ``repo``) anchor those bytes + to source, and ``dirty`` records whether the work tree had uncommitted changes + at build time (``None`` outside a git work tree). """ repo: Optional[str] = None @@ -42,15 +42,12 @@ class BuildProvenance: ref: Optional[str] = None subpath: Optional[str] = None working_tree_hash: Optional[str] = None + dirty: Optional[bool] = None author_name: Optional[str] = None author_email: Optional[str] = None build_timestamp: Optional[str] = None - @property - def is_clean_commit(self) -> bool: - return self.commit is not None and self.working_tree_hash is None - - def source_fields(self) -> dict[str, str]: + def source_fields(self) -> dict[str, object]: """The ``source_*`` form fields for the cloud-build upload (None omitted).""" fields = { "source_repo": self.repo, @@ -58,10 +55,11 @@ def source_fields(self) -> dict[str, str]: "source_ref": self.ref, "source_subpath": self.subpath, "working_tree_hash": self.working_tree_hash, + "source_dirty": self.dirty, } return {key: value for key, value in fields.items() if value is not None} - def build_info(self) -> dict[str, str]: + def build_info(self) -> dict[str, object]: """The ``build-info.json`` payload (runtime ``registration_metadata``). Overlapping keys match the server's ``DeploymentHistory`` type @@ -75,6 +73,7 @@ def build_info(self) -> dict[str, str]: "branch_name": self.ref, "subpath": self.subpath, "working_tree_hash": self.working_tree_hash, + "dirty": self.dirty, "author_name": self.author_name, "author_email": self.author_email, "build_timestamp": self.build_timestamp, @@ -167,6 +166,19 @@ def working_tree_hash(root: Path) -> str: return hashlib.sha256("\n".join(lines).encode("utf-8")).hexdigest() +def _safe_working_tree_hash(root: Path) -> Optional[str]: + """``working_tree_hash`` that degrades to None — capture must never fail a build. 
+ + A permission error or filesystem race during the walk/stat/read would otherwise + raise out of capture and abort the build before the archive is even created. + """ + try: + return working_tree_hash(root) + except Exception: + logger.warning("build-provenance: content hash failed; omitting", exc_info=True) + return None + + def capture_build_provenance( repo_path: Path, context_root: Path, content_root: Optional[Path] = None ) -> BuildProvenance: @@ -176,19 +188,18 @@ def capture_build_provenance( relative to the repo root (which agent, in a monorepo). ``content_root`` is the directory hashed — the *staged*, post-``.dockerignore`` tree that actually ships; it defaults to ``context_root`` when there is no separate staging dir. - The content hash is computed unless a clean commit identifies the build (so: - for a dirty tree or a non-git context, but not for a clean committed tree). + ``working_tree_hash`` is always computed; git coordinates anchor it to source + when available. """ timestamp = datetime.now(timezone.utc).isoformat() hash_root = content_root if content_root is not None else context_root + tree_hash = _safe_working_tree_hash(hash_root) + repo_root = _git(repo_path, "rev-parse", "--show-toplevel") if repo_root is None: - # No git at all — the content hash is the only identity available. - logger.info("build-provenance: %s is not a git work tree; hashing context", repo_path) - return BuildProvenance( - working_tree_hash=working_tree_hash(hash_root), - build_timestamp=timestamp, - ) + # No git — the content hash is the only identity available. 
+ logger.info("build-provenance: %s is not a git work tree; content hash only", repo_path) + return BuildProvenance(working_tree_hash=tree_hash, build_timestamp=timestamp) repo_root_path = Path(repo_root) commit = _git(repo_root_path, "rev-parse", "HEAD") @@ -207,10 +218,8 @@ def capture_build_provenance( except ValueError: subpath = None - # Hash unless a clean commit identifies the build: dirty tree, or an unborn - # HEAD with no commit yet, both fall back to the content hash. + # `git status --porcelain` is empty (→ _git returns None) for a clean tree. dirty = _git(repo_root_path, "status", "--porcelain") is not None - tree_hash = working_tree_hash(hash_root) if (dirty or commit is None) else None return BuildProvenance( repo=remote, @@ -218,6 +227,7 @@ def capture_build_provenance( ref=ref, subpath=subpath, working_tree_hash=tree_hash, + dirty=dirty, author_name=author_name, author_email=author_email, build_timestamp=timestamp, diff --git a/tests/lib/test_build_provenance.py b/tests/lib/test_build_provenance.py index 2a765276c..b4c2442e4 100644 --- a/tests/lib/test_build_provenance.py +++ b/tests/lib/test_build_provenance.py @@ -130,26 +130,13 @@ def test_capture_clean_tree(tmp_path: Path) -> None: assert prov.repo == "github.com/scaleapi/demo" assert prov.ref == "main" assert prov.commit is not None and len(prov.commit) == 40 - assert prov.working_tree_hash is None - assert prov.is_clean_commit is True + assert prov.working_tree_hash is not None # always computed + assert prov.dirty is False assert prov.subpath is None assert prov.author_email == "dev@scale.com" -def test_capture_dirty_tracked_modification(tmp_path: Path) -> None: - repo = _init_repo(tmp_path / "repo") - _write(repo, "main.py", "print(1)") - _commit_all(repo) - _write(repo, "main.py", "print(2)") # modify tracked, do not commit - - prov = capture_build_provenance(repo, repo) - - assert prov.is_clean_commit is False - assert prov.working_tree_hash is not None - assert prov.commit is not None # 
commit still recorded alongside the hash - - -def test_capture_dirty_untracked_file_changes_hash(tmp_path: Path) -> None: +def test_capture_untracked_file_changes_hash(tmp_path: Path) -> None: repo = _init_repo(tmp_path / "repo") _write(repo, "main.py", "print(1)") _commit_all(repo) @@ -159,7 +146,7 @@ def test_capture_dirty_untracked_file_changes_hash(tmp_path: Path) -> None: # The stale-code guard: an untracked file is part of the build context, so it # must move the hash (a `git diff` of tracked files alone would miss it). - assert prov.is_clean_commit is False + assert prov.dirty is True assert prov.working_tree_hash == working_tree_hash(repo) assert working_tree_hash(repo) != _hash_without(repo, "scratch.py") @@ -214,7 +201,7 @@ def test_capture_no_remote(tmp_path: Path) -> None: assert prov.repo is None assert prov.commit is not None - assert prov.is_clean_commit is True # no remote, but a clean commit still anchors it + assert prov.working_tree_hash is not None # always computed def test_capture_non_git_dir(tmp_path: Path) -> None: @@ -226,12 +213,28 @@ def test_capture_non_git_dir(tmp_path: Path) -> None: assert prov.repo is None assert prov.commit is None assert prov.ref is None - # No commit to point to → the content hash is the identity. + # No commit → the content hash is the identity; dirtiness is undefined (no VCS). 
 assert prov.working_tree_hash == working_tree_hash(plain) - assert prov.is_clean_commit is False + assert prov.dirty is None assert prov.build_timestamp is not None +def test_capture_never_raises_when_hash_fails(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + import agentex.lib.utils.build_provenance as bp + + plain = tmp_path / "plain" # non-git → would hash, which we force to fail + _write(plain, "main.py", "print(1)") + + def _boom(_root: Path) -> str: + raise OSError("permission denied") + + monkeypatch.setattr(bp, "working_tree_hash", _boom) + + prov = bp.capture_build_provenance(plain, plain) # must not raise + + assert prov.working_tree_hash is None + + def test_capture_monorepo_subpath(tmp_path: Path) -> None: repo = _init_repo(tmp_path / "repo") _write(repo, "agents/foo/main.py", "print(1)") From 923a110d410d1a0a1b5175559a1d3abaaee75d3e Mon Sep 17 00:00:00 2001 From: Max Parke Date: Mon, 29 Jun 2026 22:32:37 -0400 Subject: [PATCH 3/3] refactor(lib): drop the build-info.json runtime sink (Greptile) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile (T-Rex repro) showed build-info.json was written to the archive root, which the templates' Dockerfiles don't COPY and the runtime locate_build_info_path() doesn't read — so it never reached the image and the registration_metadata sink stayed empty. Beyond the placement bug, the sink is redundant: AgentexCloudDeploy.build_id is an FK to AgentexCloudBuild, so a deployment's source provenance derives from the build record (the source_* columns this work adds, Surface C) over that join — the same Build->Deploy edge lineage already traverses. No need to denormalize provenance onto registration_metadata/DeploymentHistory (which has had no producer since its read path landed 2025-09, so its git fields have never been populated). #454 now ships only the shared capture util (agentex.lib.utils.build_provenance) plus a deterministic build-archive ordering. 
Provenance is delivered via the build-record sink; the runtime sink can be revived (correctly placed) if a real consumer for deployment-history provenance ever appears. Co-Authored-By: Claude Opus 4.8 --- .../lib/cli/handlers/agent_handlers.py | 24 +------------------ tests/lib/cli/test_agent_handlers.py | 19 --------------- uv.lock | 4 ++-- 3 files changed, 3 insertions(+), 44 deletions(-) diff --git a/src/agentex/lib/cli/handlers/agent_handlers.py b/src/agentex/lib/cli/handlers/agent_handlers.py index d390e967f..1f2ccc7ef 100644 --- a/src/agentex/lib/cli/handlers/agent_handlers.py +++ b/src/agentex/lib/cli/handlers/agent_handlers.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json from typing import NamedTuple from pathlib import Path @@ -9,15 +8,12 @@ from agentex.lib.cli.debug import DebugConfig from agentex.lib.utils.logging import make_logger -from agentex.lib.utils.build_provenance import BuildProvenance, capture_build_provenance from agentex.lib.cli.handlers.run_handlers import RunError, run_agent as _run_agent from agentex.lib.sdk.config.agent_manifest import BuildContextManager, load_agent_manifest, build_context_manager logger = make_logger(__name__) console = Console() -_BUILD_INFO_FILENAME = "build-info.json" - class DockerBuildError(Exception): """An error occurred during docker build""" @@ -32,7 +28,6 @@ class CloudBuildContext(NamedTuple): tag: str image_name: str build_context_size_kb: float - provenance: BuildProvenance def build_agent( @@ -266,24 +261,8 @@ def prepare_cloud_build_context( logger.info("Preparing build context...") with build_context_manager(agent_manifest, build_context_root) as build_context: - staged_root = Path(build_context.path) - # Capture source identity over the staged (post-.dockerignore) tree — the - # exact bytes that ship — then write build-info.json into it so it lands - # in the image for runtime registration. Capture runs before the write so - # the content hash never includes build-info.json itself. 
- provenance = capture_build_provenance( - repo_path=build_context_root, - context_root=build_context_root, - content_root=staged_root, - ) - (staged_root / _BUILD_INFO_FILENAME).write_text(json.dumps(provenance.build_info(), indent=2, sort_keys=True)) - logger.info( - f"Build provenance: commit={provenance.commit} ref={provenance.ref} " - f"working_tree_hash={provenance.working_tree_hash}" - ) - # Compress the prepared context using the static zipped method - with BuildContextManager.zipped(root_path=staged_root) as archive_buffer: + with BuildContextManager.zipped(root_path=build_context.path) as archive_buffer: archive_bytes = archive_buffer.read() build_context_size_kb = len(archive_bytes) / 1024 @@ -296,5 +275,4 @@ def prepare_cloud_build_context( tag=tag, image_name=image_name, build_context_size_kb=build_context_size_kb, - provenance=provenance, ) diff --git a/tests/lib/cli/test_agent_handlers.py b/tests/lib/cli/test_agent_handlers.py index 768f6a19c..73c29cfbb 100644 --- a/tests/lib/cli/test_agent_handlers.py +++ b/tests/lib/cli/test_agent_handlers.py @@ -2,9 +2,7 @@ from __future__ import annotations -import io import os -import json import tarfile import tempfile from pathlib import Path @@ -147,23 +145,6 @@ def test_prepare_cloud_build_context_returns_cloud_build_context( assert len(result.archive_bytes) > 0 assert result.build_context_size_kb > 0 - def test_prepare_cloud_build_context_writes_build_info(self, temp_agent_dir: Path): - """build-info.json ships in the archive and matches the captured provenance.""" - manifest_path = str(temp_agent_dir / "manifest.yaml") - - result = prepare_cloud_build_context(manifest_path=manifest_path) - - # Non-git temp dir → the content hash is the identity, no commit. 
- assert result.provenance.commit is None - assert result.provenance.working_tree_hash is not None - - with tarfile.open(fileobj=io.BytesIO(result.archive_bytes), mode="r:gz") as archive: - build_info_name = next(n for n in archive.getnames() if n.endswith("build-info.json")) - member = archive.extractfile(build_info_name) - assert member is not None - shipped = json.loads(member.read()) - assert shipped == result.provenance.build_info() - def test_prepare_cloud_build_context_with_tag_override(self, temp_agent_dir: Path): """Test that tag parameter overrides manifest tag.""" manifest_path = str(temp_agent_dir / "manifest.yaml") diff --git a/uv.lock b/uv.lock index 8a41ba29c..131ad3257 100644 --- a/uv.lock +++ b/uv.lock @@ -15,7 +15,7 @@ members = [ [[package]] name = "agentex-client" -version = "0.13.0" +version = "0.16.2" source = { editable = "." } dependencies = [ { name = "anyio" }, @@ -91,7 +91,7 @@ dev = [ [[package]] name = "agentex-sdk" -version = "0.13.0" +version = "0.16.2" source = { editable = "adk" } dependencies = [ { name = "agentex-client" },