Skip to content

Commit

Permalink
Track custom metadata in artifacts.yaml (#315)
Browse files Browse the repository at this point in the history
close #313 

- [x] implement
- [x] choose a better name
- [x] add to docs
  • Loading branch information
aguschin authored Dec 16, 2022
1 parent ff500db commit d07d11e
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 88 deletions.
43 changes: 20 additions & 23 deletions gto/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import re
from collections import OrderedDict
from typing import List, Optional, Union
from typing import Any, List, Optional, Union

from funcy import distinct
from git import Repo
Expand All @@ -14,7 +13,7 @@
VERSIONS_PER_STAGE,
VersionSort,
mark_artifact_unregistered,
shortcut_regexp,
parse_shortcut,
)
from gto.exceptions import NoRepo, NotImplementedInGTO, WrongArgs
from gto.git_utils import is_url_of_remote_repo
Expand Down Expand Up @@ -52,6 +51,7 @@ def annotate(
must_exist: bool = False,
labels: List[str] = None,
description: str = "",
custom: Any = None,
commit: bool = False,
push: bool = False,
branch: str = None,
Expand All @@ -67,6 +67,7 @@ def annotate(
must_exist=must_exist,
labels=labels,
description=description,
custom=custom,
update=True,
commit=commit,
push=push or is_url_of_remote_repo(repo),
Expand All @@ -92,10 +93,12 @@ def remove(
)


def describe(repo: Union[str, Repo], name: str, rev: str = None) -> Optional[Artifact]:
def describe(
repo: Union[str, Repo], name: str, rev: Optional[str] = None
) -> Optional[Artifact]:
"""Find enrichments for the artifact"""
match = re.search(shortcut_regexp, name)
if match:
shortcut = parse_shortcut(name)
if shortcut.shortcut:
if rev:
raise WrongArgs("Either specify revision or use naming shortcut.")
# clones a remote repo second time, can be optimized
Expand All @@ -104,24 +107,20 @@ def describe(repo: Union[str, Repo], name: str, rev: str = None) -> Optional[Art
return None
if len(versions) > 1:
raise NotImplementedInGTO(
"Ambigious naming shortcut: multiple variants found."
"Ambiguous naming shortcut: multiple variants found."
)
rev = versions[0]["commit_hexsha"]
name = match["artifact"]

if not is_url_of_remote_repo(repo) and rev is None:
repo_path = repo.working_dir if isinstance(repo, Repo) else repo
if not is_url_of_remote_repo(repo_path) and rev is None:
# read artifacts.yaml without using Git
artifact = (
FileIndexManager.from_path(
repo.working_dir if isinstance(repo, Repo) else repo
)
.get_index()
.state.get(name)
FileIndexManager.from_path(repo_path).get_index().state.get(shortcut.name)
)
else:
# read Git repo
with RepoIndexManager.from_repo(repo) as index:
artifact = index.get_commit_index(rev).state.get(name)
artifact = index.get_commit_index(rev).state.get(shortcut.name)
return artifact


Expand Down Expand Up @@ -448,16 +447,14 @@ def _show_versions( # pylint: disable=too-many-locals
def format_hexsha(hexsha):
return hexsha[:7] if truncate_hexsha else hexsha

match = re.search(shortcut_regexp, name)
if match:
name = match["artifact"]
shortcut = parse_shortcut(name)

with GitRegistry.from_repo(repo=repo) as reg:
if raw:
return reg.find_artifact(name).versions
return reg.find_artifact(shortcut.name).versions

artifact = reg.find_artifact(
name,
shortcut.name,
all_branches=all_branches,
all_commits=all_commits,
)
Expand All @@ -480,11 +477,11 @@ def format_hexsha(hexsha):
]
versions.append(v)

if match and (match["greatest"] or match["latest"]):
if shortcut.latest:
versions = versions[:1]
if match and match["stage"]:
if shortcut.stage:
versions = [
v for v in versions for a in v["stages"] if match["stage"] in a["stage"]
v for v in versions for a in v["stages"] if shortcut.stage == a["stage"]
]

if not table:
Expand Down
13 changes: 10 additions & 3 deletions gto/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,12 @@ def annotate(
),
label: List[str] = Option(None, "--label", help="Labels to add to artifact"),
description: str = Option("", "-d", "--description", help="Artifact description"),
custom: str = Option(
None,
"-c",
"--custom",
help="Custom metadata to add to artifact",
),
commit: bool = option_commit,
push: bool = option_push_commit,
branch: str = option_branch,
Expand All @@ -519,6 +525,7 @@ def annotate(
must_exist=must_exist,
labels=label,
description=description,
custom=custom,
commit=commit,
push=push,
branch=branch,
Expand Down Expand Up @@ -549,7 +556,7 @@ def describe(
type: Optional[bool] = option_show_type,
path: Optional[bool] = option_show_path,
description: Optional[bool] = option_show_description,
# custom: Optional[bool] = option_show_custom,
custom: Optional[bool] = option_show_custom,
):
"""Display enrichments for an artifact."""
assert (
Expand All @@ -561,8 +568,8 @@ def describe(
field = "path"
elif description:
field = "description"
# elif custom:
# field = "custom"
elif custom:
field = "custom"
else:
field = None

Expand Down
22 changes: 2 additions & 20 deletions gto/config.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,14 @@
# pylint: disable=no-self-use, no-self-argument, inconsistent-return-statements, invalid-name, import-outside-toplevel
import pathlib
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, BaseSettings, validator
from pydantic.env_settings import InitSettingsSource
from ruamel.yaml import YAML

from gto.constants import name_regexp
from gto.exceptions import (
UnknownStage,
UnknownType,
ValidationError,
WrongConfig,
)
from gto.constants import assert_name_is_valid
from gto.exceptions import UnknownStage, UnknownType, WrongConfig
from gto.ext import EnrichmentReader, find_enrichment_types, find_enrichments

yaml = YAML(typ="safe", pure=True)
Expand All @@ -23,18 +17,6 @@
CONFIG_FILE_NAME = ".gto"


def check_name_is_valid(name):
    """Tell whether ``name`` matches the ``name_regexp`` pattern."""
    found = re.search(name_regexp, name)
    return found is not None


def assert_name_is_valid(name):
    """Raise ``ValidationError`` unless ``name`` passes ``check_name_is_valid``.

    Returns ``None`` when the name is acceptable.
    """
    if not check_name_is_valid(name):
        # The two literals are concatenated, so the first one must end with a
        # space to keep the sentences apart in the final message.
        raise ValidationError(
            f"Invalid value '{name}'. Only lowercase english letters, digits, '-', '/' are allowed. "
            "Value must be of len >= 2, must start with a letter and end with a letter or a number."
        )


class EnrichmentConfig(BaseModel):
type: str
config: Dict = {}
Expand Down
40 changes: 40 additions & 0 deletions gto/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import re
from enum import Enum
from typing import Optional

from pydantic import BaseModel

from gto.exceptions import ValidationError

COMMIT = "commit"
REF = "ref"
Expand Down Expand Up @@ -35,6 +40,41 @@ class Action(Enum):
f"^(?P<artifact>{name})(((#(?P<stage>{name})|@(?P<latest>latest)|@(?P<greatest>greatest))))$"
)


def check_name_is_valid(value):
    """Tell whether ``value`` matches the ``name_regexp`` pattern."""
    found = re.search(name_regexp, value)
    return found is not None


def assert_name_is_valid(value):
    """Raise ``ValidationError`` unless ``value`` passes ``check_name_is_valid``.

    Returns ``None`` when the value is acceptable.
    """
    if not check_name_is_valid(value):
        # The two literals are concatenated, so the first one must end with a
        # space to keep the sentences apart in the final message.
        raise ValidationError(
            f"Invalid value '{value}'. Only lowercase english letters, digits, '-', '/' are allowed. "
            "Value must be of len >= 2, must start with a letter and end with a letter or a number."
        )


# Parsed form of an artifact reference, as produced by parse_shortcut.
class Shortcut(BaseModel):
    # Artifact name (the "artifact" group when the input matched shortcut_regexp,
    # otherwise the input value itself).
    name: str
    # Stage captured after '#' in the shortcut, or None when no stage was given.
    stage: Optional[str] = None
    # True when the shortcut ended with '@latest' or '@greatest'.
    latest: bool = False
    # True when the input matched shortcut_regexp at all.
    shortcut: bool = False


def parse_shortcut(value):
    """Split an artifact reference into a ``Shortcut`` record.

    If ``value`` matches ``shortcut_regexp`` the artifact name, the optional
    stage and the latest/greatest marker are extracted from it; otherwise
    ``value`` is taken as a plain artifact name. The stage (when present) and
    the artifact name are both checked with ``assert_name_is_valid``, which
    raises on invalid input.
    """
    found = re.search(shortcut_regexp, value)
    is_shortcut = bool(found)
    stage = None
    latest = False
    if is_shortcut:
        value = found["artifact"]
        # Unmatched named groups are None; normalise any falsy stage to None.
        stage = found["stage"] or None
        latest = bool(found["latest"] or found["greatest"])
        if stage:
            assert_name_is_valid(stage)
    # The artifact name is validated whether or not a shortcut was used.
    assert_name_is_valid(value)
    return Shortcut(
        name=value,
        stage=stage,
        latest=latest,
        shortcut=is_shortcut,
    )


# taken from https://stackoverflow.com/a/22312124/19782654, modified to include url without .git at the end
remote_git_repo_regex = re.compile(
r"((git|ssh|http(s)?)|(git@[\w\.]+))(:(//)?)([\w\.@\:/\-~]+)(/)?"
Expand Down
21 changes: 18 additions & 3 deletions gto/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from datetime import datetime
from functools import wraps
from pathlib import Path
from typing import IO, Dict, FrozenSet, Generator, List, Optional, Union
from typing import IO, Any, Dict, FrozenSet, Generator, List, Optional, Union

import git
from git import Repo
Expand Down Expand Up @@ -48,6 +48,7 @@ class Artifact(BaseModel):
virtual: bool = True
labels: List[str] = [] # TODO: allow key:value labels
description: str = ""
custom: Any = None


State = Dict[str, Artifact]
Expand Down Expand Up @@ -155,7 +156,7 @@ def write_state(self, path_or_file: Union[str, IO]):

@not_frozen
def add(
self, name, type, path, must_exist, labels, description, update
self, name, type, path, must_exist, labels, description, custom, update
) -> Artifact:
if name in self and not update:
raise ArtifactExists(name)
Expand All @@ -176,13 +177,15 @@ def add(
self.state[name].virtual = True
self.state[name].labels = sorted(set(self.state[name].labels).union(labels))
self.state[name].description = description or self.state[name].description
self.state[name].custom = custom or self.state[name].custom
else:
self.state[name] = Artifact(
type=type,
path=path,
virtual=not must_exist,
labels=labels,
description=description,
custom=custom,
)
self.state_is_valid(self.state)
return self.state[name]
Expand Down Expand Up @@ -211,7 +214,16 @@ def get_history(self) -> Dict[str, Index]:
raise NotImplementedError

def add(
self, name, type, path, must_exist, labels, description, update, stdout=False
self,
name,
type,
path,
must_exist,
labels,
description,
custom,
update,
stdout=False,
):
for arg in [name] + list(labels or []):
assert_name_is_valid(arg)
Expand All @@ -232,6 +244,7 @@ def add(
must_exist=must_exist,
labels=labels or [],
description=description,
custom=custom,
update=update,
)
self.update()
Expand Down Expand Up @@ -306,6 +319,7 @@ def add(
must_exist,
labels,
description,
custom,
update,
stdout=False,
commit=False,
Expand All @@ -325,6 +339,7 @@ def add(
must_exist=must_exist,
labels=labels,
description=description,
custom=custom,
update=update,
)

Expand Down
1 change: 1 addition & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def test_add_remove(empty_git_repo: Tuple[git.Repo, Callable]):
virtual=not must_exist,
labels=[],
description="",
custom=None,
),
[],
)
Expand Down
11 changes: 11 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,21 @@ def test_commands(showcase):
["-r", path, "rf#production", "--ref"],
"rf@v1.2.3\n",
)
_check_successful_cmd("describe", ["-r", path, "artifactnotexist"], "")
_check_successful_cmd("describe", ["-r", path, "rf#stagenotexist"], "")
_check_successful_cmd("describe", ["-r", path, "rf"], EXPECTED_DESCRIBE_OUTPUT)
_check_successful_cmd(
"describe", ["-r", path, "rf#production"], EXPECTED_DESCRIBE_OUTPUT
)
_check_successful_cmd(
"describe", ["-r", path, "rf@latest"], EXPECTED_DESCRIBE_OUTPUT
)
_check_successful_cmd(
"describe", ["-r", path, "rf", "--path"], "models/random-forest.pkl\n"
)
_check_successful_cmd("describe", ["-r", path, "rf", "--type"], "model\n")
_check_successful_cmd("describe", ["-r", path, "rf", "--description"], "")
_check_successful_cmd("describe", ["-r", path, "rf", "--custom"], "")
# None because of random order - fix this
_check_successful_cmd("stages", ["-r", path], None)
# None because of output randomness and complexity
Expand Down
Loading

0 comments on commit d07d11e

Please sign in to comment.