Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ strands-agents/
│ │ │ ├── core/ # Base classes, actions, context
│ │ │ └── handlers/ # Handler implementations (e.g., LLM)
│ │ └── skills/ # AgentSkills.io integration (Skill, AgentSkills)
│ │ └── _url_loader.py # Remote URL loading, git clone, caching
│ │
│ ├── experimental/ # Experimental features (API may change)
│ │ ├── agent_config.py # Experimental agent config
Expand Down
241 changes: 241 additions & 0 deletions src/strands/vended_plugins/skills/_url_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
"""Utilities for loading skills from remote Git repository URLs.

This module provides functions to detect URL-type skill sources, parse
optional version references, clone repositories with shallow depth, and
manage a local cache of cloned skill repositories.
"""

from __future__ import annotations

import hashlib
import logging
import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path

logger = logging.getLogger(__name__)

# Patterns that indicate a string is a URL rather than a local path.
# NOTE: http:// is intentionally excluded — plaintext HTTP exposes users to
# man-in-the-middle attacks where malicious code could be injected into a
# cloned skill. Only encrypted transports are supported.
_URL_PREFIXES = ("https://", "git@", "ssh://")

# Regex to strip .git suffix from URLs before ref parsing
_GIT_SUFFIX = re.compile(r"\.git$")

# Matches GitHub /tree/<ref> or /tree/<ref>/<path> (also /blob/)
# e.g. /owner/repo/tree/main/skills/my-skill -> groups: (/owner/repo, main, skills/my-skill)
_GITHUB_TREE_PATTERN = re.compile(r"^(/[^/]+/[^/]+)/(?:tree|blob)/([^/]+)(?:/(.+?))?/?$")


def _default_cache_dir() -> Path:
"""Return the default cache directory, respecting ``XDG_CACHE_HOME``.

Evaluated lazily (not at import time) so that ``Path.home()`` is not
called in environments where ``HOME`` may be unset (e.g. containers).
"""
xdg = os.environ.get("XDG_CACHE_HOME")
base = Path(xdg) if xdg else Path.home() / ".cache"
return base / "strands" / "skills"


def is_url(source: str) -> bool:
"""Check whether a skill source string looks like a remote URL.

Args:
source: The skill source string to check.

Returns:
True if the source appears to be a URL.
"""
return any(source.startswith(prefix) for prefix in _URL_PREFIXES)


def parse_url_ref(url: str) -> tuple[str, str | None, str | None]:
"""Parse a skill URL into a clone URL, optional Git ref, and optional subpath.

Supports an ``@ref`` suffix for specifying a branch, tag, or commit::

https://github.com/org/skill-repo@v1.0.0 -> (https://github.com/org/skill-repo, v1.0.0, None)
https://github.com/org/skill-repo -> (https://github.com/org/skill-repo, None, None)

Also supports GitHub web URLs with ``/tree/<ref>/path`` ::

https://github.com/org/repo/tree/main/skills/my-skill
-> (https://github.com/org/repo, main, skills/my-skill)

Args:
url: The skill URL, optionally with an ``@ref`` suffix or ``/tree/`` path.

Returns:
Tuple of (clone_url, ref_or_none, subpath_or_none).
"""
if url.startswith(("https://", "http://", "ssh://")):
# Find the path portion after the host
scheme_end = url.index("//") + 2
host_end = url.find("/", scheme_end)
if host_end == -1:
return url, None, None

path_part = url[host_end:]

# Handle GitHub /tree/<ref>/path and /blob/<ref>/path URLs
tree_match = _GITHUB_TREE_PATTERN.match(path_part)
if tree_match:
owner_repo = tree_match.group(1)
ref = tree_match.group(2)
subpath = tree_match.group(3) or None
clone_url = url[:host_end] + owner_repo
return clone_url, ref, subpath

# Strip .git suffix before looking for @ref so that
# "repo.git@v1" is handled correctly
clean_path = _GIT_SUFFIX.sub("", path_part)
had_git_suffix = clean_path != path_part

if "@" in clean_path:
at_idx = clean_path.rfind("@")
ref = clean_path[at_idx + 1 :]
base_path = clean_path[:at_idx]
if had_git_suffix:
base_path += ".git"
return url[:host_end] + base_path, ref, None

return url, None, None

if url.startswith("git@"):
# SSH format: git@host:owner/repo.git@ref
# The first @ is part of the SSH URL format.
first_at = url.index("@")
rest = url[first_at + 1 :]

clean_rest = _GIT_SUFFIX.sub("", rest)
had_git_suffix = clean_rest != rest

if "@" in clean_rest:
at_idx = clean_rest.rfind("@")
ref = clean_rest[at_idx + 1 :]
base_rest = clean_rest[:at_idx]
if had_git_suffix:
base_rest += ".git"
return url[: first_at + 1] + base_rest, ref, None

return url, None, None

return url, None, None


def cache_key(url: str, ref: str | None) -> str:
"""Generate a deterministic cache directory name from a URL and ref.

Args:
url: The clone URL.
ref: The optional Git ref.

Returns:
A short hex digest suitable for use as a directory name.
"""
key_input = f"{url}@{ref}" if ref else url
return hashlib.sha256(key_input.encode()).hexdigest()[:16]


def clone_skill_repo(
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: There is no mechanism to refresh a cached clone. When no ref is specified, the default branch is cloned. If the remote repo is updated, the user gets stale content indefinitely with no obvious way to refresh (other than manually deleting ~/.cache/strands/skills/).

Suggestion: Consider adding a force_refresh: bool = False parameter (or documenting the cache invalidation story clearly). At minimum, document in the docstring that users should delete the cache directory or pin a ref for reproducibility. This is especially important since the cache key is a hash — users can't easily find and delete the right directory.

url: str,
*,
ref: str | None = None,
subpath: str | None = None,
cache_dir: Path | None = None,
force_refresh: bool = False,
) -> Path:
"""Clone a skill repository to a local cache directory.

Uses ``git clone --depth 1`` for efficiency. If a ``ref`` is provided it
is passed as ``--branch`` (works for branches and tags). Repositories are
cached by a hash of (url, ref) so repeated loads are instant.

If ``subpath`` is provided, the returned path points to that subdirectory
within the cloned repository (useful for mono-repos containing skills in
nested directories).

**Cache behaviour**: Cloned repos are cached at
``$XDG_CACHE_HOME/strands/skills/`` (or ``~/.cache/strands/skills/``).
When no ``ref`` is pinned the default branch is cached; pass
``force_refresh=True`` to re-clone, or pin a specific tag/branch for
reproducibility.

Args:
url: The Git clone URL.
ref: Optional branch or tag to check out.
subpath: Optional path within the repo to return (e.g. ``skills/my-skill``).
cache_dir: Override the default cache directory.
force_refresh: If True, delete any cached clone and re-fetch from the
remote. Useful for unpinned refs that may have been updated.

Returns:
Path to the cloned repository root, or to ``subpath`` within it.

Raises:
RuntimeError: If the clone fails or ``git`` is not installed.
"""
cache_dir = cache_dir or _default_cache_dir()
cache_dir.mkdir(parents=True, exist_ok=True)

key = cache_key(url, ref)
target = cache_dir / key

if force_refresh and target.exists():
logger.info("url=<%s>, ref=<%s> | force-refreshing cached skill", url, ref)
shutil.rmtree(target)

if not target.exists():
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: Race condition — if two threads/processes call clone_skill_repo concurrently with the same URL, both pass the not target.exists() check and attempt to clone into the same directory. One will fail or produce a corrupt state.

Suggestion: Use an atomic pattern: clone to a temporary directory first, then os.rename() (atomic on the same filesystem) to the target path. If the target already exists by the time rename happens, just delete the temp clone. Something like:

import tempfile

with tempfile.TemporaryDirectory(dir=cache_dir) as tmp:
    tmp_target = Path(tmp) / "repo"
    # ... clone into tmp_target ...
    try:
        tmp_target.rename(target)
    except OSError:
        # Another process won the race — that's fine, use their clone
        pass

logger.info("url=<%s>, ref=<%s> | cloning skill repository", url, ref)

cmd: list[str] = ["git", "clone", "--depth", "1"]
if ref:
cmd.extend(["--branch", ref])

# Clone into a temporary directory first, then atomically rename to
# the target path. This prevents a race condition where two
# concurrent processes both pass the ``not target.exists()`` check
# and attempt to clone into the same directory.
tmp_dir = tempfile.mkdtemp(dir=cache_dir, prefix=".tmp-clone-")
tmp_target = Path(tmp_dir) / "repo"

try:
cmd.extend([url, str(tmp_target)])
subprocess.run( # noqa: S603
cmd,
check=True,
capture_output=True,
text=True,
timeout=120,
)
try:
tmp_target.rename(target)
except OSError:
# Another process completed the clone first — use theirs
logger.debug("url=<%s> | another process already cached this skill, using existing", url)
except subprocess.CalledProcessError as e:
raise RuntimeError(
f"url=<{url}>, ref=<{ref}> | failed to clone skill repository: {e.stderr.strip()}"
) from e
except FileNotFoundError as e:
raise RuntimeError("git is required to load skills from URLs but was not found on PATH") from e
finally:
# Always clean up the temp directory
if Path(tmp_dir).exists():
shutil.rmtree(tmp_dir, ignore_errors=True)
else:
logger.debug("url=<%s>, ref=<%s> | using cached skill at %s", url, ref, target)

result = target / subpath if subpath else target

if subpath and not result.is_dir():
raise RuntimeError(f"url=<{url}>, subpath=<{subpath}> | subdirectory does not exist in cloned repository")

logger.debug("url=<%s>, ref=<%s> | resolved to %s", url, ref, result)
return result
20 changes: 19 additions & 1 deletion src/strands/vended_plugins/skills/agent_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(
state_key: str = _DEFAULT_STATE_KEY,
max_resource_files: int = _DEFAULT_MAX_RESOURCE_FILES,
strict: bool = False,
cache_dir: Path | None = None,
) -> None:
"""Initialize the AgentSkills plugin.

Expand All @@ -86,11 +87,16 @@ def __init__(
- A ``str`` or ``Path`` to a skill directory (containing SKILL.md)
- A ``str`` or ``Path`` to a parent directory (containing skill subdirectories)
- A ``Skill`` dataclass instance
- A remote Git URL (``https://``, ``git@``, or ``ssh://``)
with optional ``@ref`` suffix for branch/tag pinning
state_key: Key used to store plugin state in ``agent.state``.
max_resource_files: Maximum number of resource files to list in skill responses.
strict: If True, raise on skill validation issues. If False (default), warn and load anyway.
cache_dir: Directory for caching cloned skill repositories.
Defaults to ``~/.cache/strands/skills/``.
"""
self._strict = strict
self._cache_dir = cache_dir
self._skills: dict[str, Skill] = self._resolve_skills(_normalize_sources(skills))
self._state_key = state_key
self._max_resource_files = max_resource_files
Expand Down Expand Up @@ -284,21 +290,33 @@ def _resolve_skills(self, sources: list[SkillSource]) -> dict[str, Skill]:
"""Resolve a list of skill sources into Skill instances.

Each source can be a Skill instance, a path to a skill directory,
or a path to a parent directory containing multiple skills.
a path to a parent directory containing multiple skills, or a remote
Git URL.

Args:
sources: List of skill sources to resolve.

Returns:
Dict mapping skill names to Skill instances.
"""
from ._url_loader import is_url

resolved: dict[str, Skill] = {}

for source in sources:
if isinstance(source, Skill):
if source.name in resolved:
logger.warning("name=<%s> | duplicate skill name, overwriting previous skill", source.name)
resolved[source.name] = source
elif isinstance(source, str) and is_url(source):
try:
url_skills = Skill.from_url(source, cache_dir=self._cache_dir, strict=self._strict)
for skill in url_skills:
if skill.name in resolved:
logger.warning("name=<%s> | duplicate skill name, overwriting previous skill", skill.name)
resolved[skill.name] = skill
except (RuntimeError, ValueError) as e:
logger.warning("url=<%s> | failed to load skill from URL: %s", source, e)
else:
path = Path(source).resolve()
if not path.exists():
Expand Down
64 changes: 64 additions & 0 deletions src/strands/vended_plugins/skills/skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,70 @@ def from_content(cls, content: str, *, strict: bool = False) -> Skill:

return _build_skill_from_frontmatter(frontmatter, body)

@classmethod
def from_url(
cls,
url: str,
*,
cache_dir: Path | None = None,
force_refresh: bool = False,
strict: bool = False,
) -> list[Skill]:
"""Load skill(s) from a remote Git repository URL.

Clones the repository (or uses a cached copy) and then loads skills
using the standard filesystem methods. If the repository root contains
a ``SKILL.md`` file it is treated as a single skill; otherwise it is
scanned for skill subdirectories.

Supports an optional ``@ref`` suffix for branch or tag pinning::

skills = Skill.from_url("https://github.com/org/my-skill@v1.0.0")

Also supports GitHub web URLs pointing to subdirectories::

skills = Skill.from_url("https://github.com/org/repo/tree/main/skills/my-skill")

Args:
url: A Git-cloneable URL, optionally with an ``@ref`` suffix or
a GitHub ``/tree/<ref>/path`` URL. Only ``https://``,
``ssh://``, and ``git@`` schemes are supported; plaintext
``http://`` is rejected for security.
cache_dir: Override the default cache directory
(``$XDG_CACHE_HOME/strands/skills/`` or
``~/.cache/strands/skills/``).
force_refresh: If True, discard any cached clone and re-fetch from
the remote. Useful when loading unpinned refs (e.g. ``main``)
that may have been updated upstream.
strict: If True, raise on any validation issue. If False (default),
warn and load anyway.

Returns:
List of Skill instances loaded from the repository.

Raises:
ValueError: If ``url`` is not a recognised remote URL scheme.
RuntimeError: If the repository cannot be cloned or ``git`` is not
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: The Raises section documents RuntimeError but the method also raises ValueError (via is_url check on line 377). Additionally, calling Skill.from_url() with a non-URL raises ValueError, but using the same string through AgentSkills(skills=["./path"]) treats it as a local path. This inconsistency could confuse users.

Suggestion: Add ValueError to the Raises section in the docstring.

available.
"""
from ._url_loader import clone_skill_repo, is_url, parse_url_ref

if not is_url(url):
raise ValueError(f"url=<{url}> | not a valid remote URL")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: The from_url ValueError uses an f-string in the exception message: f"url=<{url}> | not a valid remote URL". This is consistent with the structured logging style used elsewhere, but the AGENTS.md specifically says "Don't use f-strings in logging calls." Since this is an exception message (not a logging call), this is acceptable — but the logging calls throughout the module correctly use %s format, which is good.

However, the RuntimeError raised in clone_skill_repo at line 189 also uses an f-string: f"url=<{url}>, ref=<{ref}> | failed to clone...". This is fine since it's an exception message, not a log statement.


clean_url, ref, subpath = parse_url_ref(url)
repo_path = clone_skill_repo(
clean_url, ref=ref, subpath=subpath, cache_dir=cache_dir, force_refresh=force_refresh
)

# If the repo root is itself a skill, load it directly
has_skill_md = (repo_path / "SKILL.md").is_file() or (repo_path / "skill.md").is_file()

if has_skill_md:
return [cls.from_file(repo_path, strict=strict)]
else:
return cls.from_directory(repo_path, strict=strict)

@classmethod
def from_directory(cls, skills_dir: str | Path, *, strict: bool = False) -> list[Skill]:
"""Load all skills from a parent directory containing skill subdirectories.
Expand Down
Loading
Loading