diff --git a/src/agentex/lib/sdk/config/agent_manifest.py b/src/agentex/lib/sdk/config/agent_manifest.py
index fd743e635..c2fe03052 100644
--- a/src/agentex/lib/sdk/config/agent_manifest.py
+++ b/src/agentex/lib/sdk/config/agent_manifest.py
@@ -24,6 +24,7 @@
 from agentex.lib.utils.io import load_yaml_file
 from agentex.lib.utils.logging import make_logger
 from agentex.config.agent_manifest import AgentManifest  # noqa: F401
+from agentex.lib.utils.build_provenance import iter_context_files
 
 logger = make_logger(__name__)
 
@@ -189,12 +190,11 @@ def zipped(root_path: Path | None = None) -> Iterator[IO[bytes]]:
 
         tar_buffer = io.BytesIO()
 
+        # Sorted, relpath-stable enumeration (shared with the content hash) so the
+        # archive's member order is deterministic across machines.
         with tarfile.open(fileobj=tar_buffer, mode="w:gz") as tar_file:
-            for path in Path(root_path).rglob(
-                "*"
-            ):  # Recursively add files to the tar.gz
-                if path.is_file():  # Ensure that we're only adding files
-                    tar_file.add(path, arcname=path.relative_to(root_path))
+            for path in iter_context_files(Path(root_path)):
+                tar_file.add(path, arcname=path.relative_to(root_path))
 
         tar_buffer.seek(0)  # Reset the buffer position to the beginning
         yield tar_buffer
diff --git a/src/agentex/lib/utils/build_provenance.py b/src/agentex/lib/utils/build_provenance.py
new file mode 100644
index 000000000..af17b03c9
--- /dev/null
+++ b/src/agentex/lib/utils/build_provenance.py
@@ -0,0 +1,249 @@
+"""Client-attested build provenance capture (AGX1-418).
+
+The single producer of source identity for agent builds: git coordinates plus a
+deterministic content hash of the build context. Every build path (CLI, sgpctl,
+CI) imports this so capture logic and the ``working_tree_hash`` definition live
+in exactly one place. Capture is best-effort — a missing/odd git state degrades
+to nulls and never raises into a build.
+"""
+
+from __future__ import annotations
+
+import os
+import stat
+import hashlib
+import subprocess
+from typing import Optional
+from pathlib import Path
+from datetime import datetime, timezone
+from dataclasses import dataclass
+
+from agentex.lib.utils.logging import make_logger
+
+logger = make_logger(__name__)
+
+_GIT_TIMEOUT_S = 5
+_HASH_CHUNK_BYTES = 1 << 20
+
+
+@dataclass(frozen=True)
+class BuildProvenance:
+    """Source identity for one build; every field degrades to ``None``.
+
+    ``working_tree_hash`` is the deterministic content hash of the build context
+    and is always computed — it identifies the exact bytes that shipped,
+    independent of git state. ``commit`` (+ ``ref`` / ``repo``) anchor those bytes
+    to source, and ``dirty`` records whether the work tree had uncommitted changes
+    at build time (``None`` outside a git work tree or when git status fails).
+    """
+
+    repo: Optional[str] = None
+    commit: Optional[str] = None
+    ref: Optional[str] = None
+    subpath: Optional[str] = None
+    working_tree_hash: Optional[str] = None
+    dirty: Optional[bool] = None
+    author_name: Optional[str] = None
+    author_email: Optional[str] = None
+    build_timestamp: Optional[str] = None
+
+    def source_fields(self) -> dict[str, object]:
+        """The ``source_*`` form fields for the cloud-build upload (None omitted)."""
+        fields = {
+            "source_repo": self.repo,
+            "source_commit": self.commit,
+            "source_ref": self.ref,
+            "source_subpath": self.subpath,
+            "working_tree_hash": self.working_tree_hash,
+            "source_dirty": self.dirty,
+        }
+        return {key: value for key, value in fields.items() if value is not None}
+
+    def build_info(self) -> dict[str, object]:
+        """The ``build-info.json`` payload (runtime ``registration_metadata``).
+
+        Overlapping keys match the server's ``DeploymentHistory`` type
+        (``commit_hash`` / ``branch_name`` / ``author_*`` / ``build_timestamp``),
+        which is populated from ``registration_metadata``; the rest are the
+        provenance-specific coordinates.
+        """
+        info = {
+            "repo": self.repo,
+            "commit_hash": self.commit,
+            "branch_name": self.ref,
+            "subpath": self.subpath,
+            "working_tree_hash": self.working_tree_hash,
+            "dirty": self.dirty,
+            "author_name": self.author_name,
+            "author_email": self.author_email,
+            "build_timestamp": self.build_timestamp,
+        }
+        return {key: value for key, value in info.items() if value is not None}
+
+
+def _git(repo_root: Path, *args: str) -> Optional[str]:
+    """Run a git command under ``repo_root``; return stripped stdout or None."""
+    try:
+        proc = subprocess.run(
+            ("git", "-C", str(repo_root), *args),
+            capture_output=True,
+            text=True,
+            # Undecodable bytes (e.g. a legacy-encoded author name) must not raise.
+            errors="replace",
+            timeout=_GIT_TIMEOUT_S,
+            check=False,
+        )
+    except (OSError, subprocess.SubprocessError):
+        return None
+    if proc.returncode != 0:
+        return None
+    return proc.stdout.strip() or None
+
+
+def normalize_remote(url: Optional[str]) -> Optional[str]:
+    """Canonicalize a git remote to ``host/path`` — credentials and scheme stripped.
+
+    ``git@github.com:org/repo.git`` and ``https://x:tok@github.com/org/repo.git``
+    both normalize to ``github.com/org/repo``. Host is lowercased; path casing is
+    preserved (repo paths can be case-significant).
+    """
+    if not url:
+        return None
+    candidate = url.strip()
+    # scp-like syntax: git@host:org/repo(.git) — no scheme, host/path split on ':'
+    if "://" not in candidate and ":" in candidate and "/" not in candidate.split(":", 1)[0]:
+        candidate = candidate.split("@", 1)[-1].replace(":", "/", 1)
+    else:
+        if "://" in candidate:
+            candidate = candidate.split("://", 1)[1]
+        # Userinfo lives only in the authority (before the first '/'). Cut at its
+        # *last* '@' so a password containing '@' cannot leak and an '@' in the
+        # repo path is left alone.
+        authority, slash, rest = candidate.partition("/")
+        candidate = f"{authority.rpartition('@')[2]}{slash}{rest}"
+    # Trim slashes first so a trailing ``repo.git/`` still loses its suffix.
+    candidate = candidate.strip("/")
+    if candidate.endswith(".git"):
+        candidate = candidate[: -len(".git")]
+    candidate = candidate.strip("/")
+    if not candidate:
+        return None
+    host, slash, path = candidate.partition("/")
+    return f"{host.lower()}{slash}{path}"
+
+
+def _sha256_file(path: Path) -> str:
+    digest = hashlib.sha256()
+    with open(path, "rb") as handle:
+        while chunk := handle.read(_HASH_CHUNK_BYTES):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def iter_context_files(root: Path) -> list[Path]:
+    """Files (and symlinks) under ``root``, sorted by POSIX relpath.
+
+    The canonical, order-stable enumeration shared by the content hash and the
+    archive packer so the two can never drift on which files they cover.
+    """
+    return sorted(
+        (path for path in root.rglob("*") if path.is_symlink() or path.is_file()),
+        key=lambda path: path.relative_to(root).as_posix(),
+    )
+
+
+def working_tree_hash(root: Path) -> str:
+    """Deterministic content hash of the build context at ``root``.
+
+    sha256 over the sorted ``(relpath, normalized mode, content digest)`` of every
+    file — the build *inputs*, not the tarball (tar/gzip framing is
+    non-deterministic and would defeat dedupe). Mode is normalized to the
+    executable bit; symlinks hash their target string, not the resolved content.
+    """
+    lines: list[str] = []
+    for path in iter_context_files(root):
+        relpath = path.relative_to(root).as_posix()
+        if path.is_symlink():
+            mode = "120000"
+            # surrogateescape round-trips non-UTF-8 names instead of raising.
+            target = os.readlink(path).encode("utf-8", "surrogateescape")
+            content_digest = hashlib.sha256(target).hexdigest()
+        else:
+            executable = bool(path.stat().st_mode & stat.S_IXUSR)
+            mode = "100755" if executable else "100644"
+            content_digest = _sha256_file(path)
+        lines.append(f"{relpath}\x00{mode}\x00{content_digest}")
+    return hashlib.sha256("\n".join(lines).encode("utf-8", "surrogateescape")).hexdigest()
+
+
+def _safe_working_tree_hash(root: Path) -> Optional[str]:
+    """``working_tree_hash`` that degrades to None — capture must never fail a build.
+
+    A permission error or filesystem race during the walk/stat/read would otherwise
+    raise out of capture and abort the build before the archive is even created.
+    """
+    try:
+        return working_tree_hash(root)
+    except Exception:
+        logger.warning("build-provenance: content hash failed; omitting", exc_info=True)
+        return None
+
+
+def capture_build_provenance(
+    repo_path: Path, context_root: Path, content_root: Optional[Path] = None
+) -> BuildProvenance:
+    """Capture source identity for a build of ``context_root``.
+
+    ``repo_path`` is where git is interrogated and ``subpath`` is ``context_root``
+    relative to the repo root (which agent, in a monorepo). ``content_root`` is
+    the directory hashed — the *staged*, post-``.dockerignore`` tree that actually
+    ships; it defaults to ``context_root`` when there is no separate staging dir.
+    ``working_tree_hash`` is always computed; git coordinates anchor it to source
+    when available.
+    """
+    timestamp = datetime.now(timezone.utc).isoformat()
+    hash_root = content_root if content_root is not None else context_root
+    tree_hash = _safe_working_tree_hash(hash_root)
+
+    repo_root = _git(repo_path, "rev-parse", "--show-toplevel")
+    if repo_root is None:
+        # No git — the content hash is the only identity available.
+        logger.info("build-provenance: %s is not a git work tree; content hash only", repo_path)
+        return BuildProvenance(working_tree_hash=tree_hash, build_timestamp=timestamp)
+
+    repo_root_path = Path(repo_root)
+    commit = _git(repo_root_path, "rev-parse", "HEAD")
+    # symbolic-ref fails on a detached HEAD (→ None); fall back to an exact tag.
+    ref = _git(repo_root_path, "symbolic-ref", "--short", "HEAD") or _git(
+        repo_root_path, "describe", "--tags", "--exact-match"
+    )
+    remote = normalize_remote(_git(repo_root_path, "remote", "get-url", "origin"))
+    author_name = _git(repo_root_path, "log", "-1", "--format=%an")
+    author_email = _git(repo_root_path, "log", "-1", "--format=%ae")
+
+    subpath: Optional[str] = None
+    try:
+        relative = context_root.resolve().relative_to(repo_root_path.resolve()).as_posix()
+        subpath = relative if relative != "." else None
+    except ValueError:
+        subpath = None
+
+    # `--branch` makes a successful status always print a `## <branch>` header, so
+    # None here means git itself failed (timeout/error): report "unknown" rather
+    # than attesting a clean tree. Any non-header line is an uncommitted change.
+    status = _git(repo_root_path, "status", "--porcelain", "--branch")
+    dirty: Optional[bool] = None
+    if status is not None:
+        dirty = any(not line.startswith("##") for line in status.splitlines())
+
+    return BuildProvenance(
+        repo=remote,
+        commit=commit,
+        ref=ref,
+        subpath=subpath,
+        working_tree_hash=tree_hash,
+        dirty=dirty,
+        author_name=author_name,
+        author_email=author_email,
+        build_timestamp=timestamp,
+    )
diff --git a/tests/lib/test_build_provenance.py b/tests/lib/test_build_provenance.py
new file mode 100644
index 000000000..b4c2442e4
--- /dev/null
+++ b/tests/lib/test_build_provenance.py
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+import pytest
+
+from agentex.lib.utils.build_provenance import (
+    normalize_remote,
+    working_tree_hash,
+    iter_context_files,
+    capture_build_provenance,
+)
+
+
+def _git(repo: Path, *args: str) -> None:
+    subprocess.run(("git", "-C", str(repo), *args), check=True, capture_output=True, text=True)
+
+
+def _init_repo(path: Path, *, remote: str | None = "git@github.com:scaleapi/demo.git") -> Path:
+    path.mkdir(parents=True, exist_ok=True)
+    _git(path, "init", "-q")
+    _git(path, "config", "user.email", "dev@scale.com")
+    _git(path, "config", "user.name", "Dev")
+    _git(path, "config", "commit.gpgsign", "false")
+    if remote:
+        _git(path, "remote", "add", "origin", remote)
+    return path
+
+
+def _commit_all(path: Path, message: str = "init") -> None:
+    _git(path, "add", "-A")
+    _git(path, "commit", "-q", "-m", message)
+    _git(path, "branch", "-M", "main")
+
+
+def _write(root: Path, rel: str, content: str = "x") -> None:
+    target = root / rel
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(content)
+
+
+# --- normalize_remote ---------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    ("raw", "expected"),
+    [
+        ("git@github.com:scaleapi/Repo.git", "github.com/scaleapi/Repo"),
+        ("https://github.com/scaleapi/Repo.git", "github.com/scaleapi/Repo"),
+        ("https://x-token:secret@GitHub.com/scaleapi/Repo", "github.com/scaleapi/Repo"),
+        ("ssh://git@gitlab.com/group/sub/proj.git", "gitlab.com/group/sub/proj"),
+        ("https://user:p@ss@github.com/scaleapi/Repo.git", "github.com/scaleapi/Repo"),
+        ("https://github.com/scaleapi/re@po.git", "github.com/scaleapi/re@po"),
+        ("https://github.com/scaleapi/Repo.git/", "github.com/scaleapi/Repo"),
+        ("", None),
+        (None, None),
+    ],
+)
+def test_normalize_remote(raw: str | None, expected: str | None) -> None:
+    assert normalize_remote(raw) == expected
+
+
+# --- working_tree_hash --------------------------------------------------------
+
+
+def test_hash_is_order_independent(tmp_path: Path) -> None:
+    first = tmp_path / "a"
+    second = tmp_path / "b"
+    for rel in ("z.txt", "a/b.txt", "m.txt"):
+        _write(first, rel, rel)
+    # Same content, different creation order.
+    for rel in ("m.txt", "z.txt", "a/b.txt"):
+        _write(second, rel, rel)
+    assert working_tree_hash(first) == working_tree_hash(second)
+
+
+def test_hash_changes_on_one_byte(tmp_path: Path) -> None:
+    root = tmp_path / "ctx"
+    _write(root, "f.txt", "hello")
+    before = working_tree_hash(root)
+    _write(root, "f.txt", "hellp")
+    assert working_tree_hash(root) != before
+
+
+def test_hash_changes_when_file_added(tmp_path: Path) -> None:
+    root = tmp_path / "ctx"
+    _write(root, "f.txt", "hello")
+    before = working_tree_hash(root)
+    _write(root, "g.txt", "new")
+    assert working_tree_hash(root) != before
+
+
+def test_hash_changes_on_executable_bit(tmp_path: Path) -> None:
+    root = tmp_path / "ctx"
+    script = root / "run.sh"
+    _write(root, "run.sh", "#!/bin/sh\n")
+    before = working_tree_hash(root)
+    script.chmod(0o755)
+    assert working_tree_hash(root) != before
+
+
+def test_symlink_hashes_target_not_resolved_content(tmp_path: Path) -> None:
+    root = tmp_path / "ctx"
+    root.mkdir()
+    # Dangling symlinks: distinct hashes prove the target string is hashed, not
+    # resolved content (resolving would raise).
+    (root / "link").symlink_to("points/to/a")
+    hash_a = working_tree_hash(root)
+    (root / "link").unlink()
+    (root / "link").symlink_to("points/to/b")
+    assert working_tree_hash(root) != hash_a
+
+
+def test_iter_context_files_skips_directories(tmp_path: Path) -> None:
+    root = tmp_path / "ctx"
+    _write(root, "pkg/mod.py", "x")
+    _write(root, "top.txt", "y")
+    rels = [path.relative_to(root).as_posix() for path in iter_context_files(root)]
+    assert rels == ["pkg/mod.py", "top.txt"]
+
+
+# --- capture_build_provenance -------------------------------------------------
+
+
+def test_capture_clean_tree(tmp_path: Path) -> None:
+    repo = _init_repo(tmp_path / "repo")
+    _write(repo, "main.py", "print(1)")
+    _commit_all(repo)
+
+    prov = capture_build_provenance(repo, repo)
+
+    assert prov.repo == "github.com/scaleapi/demo"
+    assert prov.ref == "main"
+    assert prov.commit is not None and len(prov.commit) == 40
+    assert prov.working_tree_hash is not None  # always computed
+    assert prov.dirty is False
+    assert prov.subpath is None
+    assert prov.author_email == "dev@scale.com"
+
+
+def test_capture_untracked_file_changes_hash(tmp_path: Path) -> None:
+    repo = _init_repo(tmp_path / "repo")
+    _write(repo, "main.py", "print(1)")
+    _commit_all(repo)
+    _write(repo, "scratch.py", "debug = True")  # untracked
+
+    prov = capture_build_provenance(repo, repo)
+
+    # The stale-code guard: an untracked file is part of the build context, so it
+    # must move the hash (a `git diff` of tracked files alone would miss it).
+    assert prov.dirty is True
+    assert prov.working_tree_hash == working_tree_hash(repo)
+    assert working_tree_hash(repo) != _hash_without(repo, "scratch.py")
+
+
+def _hash_without(repo: Path, rel: str) -> str:
+    removed = repo / rel
+    saved = removed.read_text()
+    removed.unlink()
+    try:
+        return working_tree_hash(repo)
+    finally:
+        removed.write_text(saved)
+
+
+def test_capture_detached_head_has_no_ref(tmp_path: Path) -> None:
+    repo = _init_repo(tmp_path / "repo")
+    _write(repo, "main.py", "print(1)")
+    _commit_all(repo)
+    _write(repo, "main.py", "print(2)")
+    _git(repo, "add", "-A")
+    _git(repo, "commit", "-q", "-m", "second")
+    first = subprocess.run(
+        ("git", "-C", str(repo), "rev-list", "--max-parents=0", "HEAD"),
+        check=True,
+        capture_output=True,
+        text=True,
+    ).stdout.strip()
+    _git(repo, "checkout", "-q", first)
+
+    prov = capture_build_provenance(repo, repo)
+
+    assert prov.commit == first
+    assert prov.ref is None
+
+
+def test_capture_detached_on_tag_uses_tag(tmp_path: Path) -> None:
+    repo = _init_repo(tmp_path / "repo")
+    _write(repo, "main.py", "print(1)")
+    _commit_all(repo)
+    _git(repo, "tag", "v1.2.3")
+    _git(repo, "checkout", "-q", "v1.2.3")
+
+    assert capture_build_provenance(repo, repo).ref == "v1.2.3"
+
+
+def test_capture_no_remote(tmp_path: Path) -> None:
+    repo = _init_repo(tmp_path / "repo", remote=None)
+    _write(repo, "main.py", "print(1)")
+    _commit_all(repo)
+
+    prov = capture_build_provenance(repo, repo)
+
+    assert prov.repo is None
+    assert prov.commit is not None
+    assert prov.working_tree_hash is not None  # always computed
+
+
+def test_capture_non_git_dir(tmp_path: Path) -> None:
+    plain = tmp_path / "plain"
+    _write(plain, "main.py", "print(1)")
+
+    prov = capture_build_provenance(plain, plain)
+
+    assert prov.repo is None
+    assert prov.commit is None
+    assert prov.ref is None
+    # No commit → the content hash is the identity; dirtiness is undefined (no VCS).
+    assert prov.working_tree_hash == working_tree_hash(plain)
+    assert prov.dirty is None
+    assert prov.build_timestamp is not None
+
+
+def test_capture_never_raises_when_hash_fails(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    import agentex.lib.utils.build_provenance as bp
+
+    plain = tmp_path / "plain"  # non-git → would hash, which we force to fail
+    _write(plain, "main.py", "print(1)")
+
+    def _boom(_root: Path) -> str:
+        raise OSError("permission denied")
+
+    monkeypatch.setattr(bp, "working_tree_hash", _boom)
+
+    prov = bp.capture_build_provenance(plain, plain)  # must not raise
+
+    assert prov.working_tree_hash is None
+
+
+def test_capture_dirty_is_unknown_when_status_fails(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    import agentex.lib.utils.build_provenance as bp
+
+    repo = _init_repo(tmp_path / "repo")
+    _write(repo, "main.py", "print(1)")
+    _commit_all(repo)
+    real_git = bp._git
+
+    def _status_fails(repo_root: Path, *args: str) -> str | None:
+        return None if args[:1] == ("status",) else real_git(repo_root, *args)
+
+    monkeypatch.setattr(bp, "_git", _status_fails)
+
+    prov = bp.capture_build_provenance(repo, repo)
+
+    assert prov.commit is not None
+    assert prov.dirty is None  # unknown, not a false "clean"
+
+
+def test_capture_monorepo_subpath(tmp_path: Path) -> None:
+    repo = _init_repo(tmp_path / "repo")
+    _write(repo, "agents/foo/main.py", "print(1)")
+    _commit_all(repo)
+
+    prov = capture_build_provenance(repo, repo / "agents" / "foo")
+
+    assert prov.subpath == "agents/foo"
diff --git a/uv.lock b/uv.lock
index 8a41ba29c..131ad3257 100644
--- a/uv.lock
+++ b/uv.lock
@@ -15,7 +15,7 @@ members = [
 
 [[package]]
 name = "agentex-client"
-version = "0.13.0"
+version = "0.16.2"
 source = { editable = "." }
 dependencies = [
     { name = "anyio" },
@@ -91,7 +91,7 @@ dev = [
 
 [[package]]
 name = "agentex-sdk"
-version = "0.13.0"
+version = "0.16.2"
 source = { editable = "adk" }
 dependencies = [
     { name = "agentex-client" },