Skip to content

Commit

Permalink
WIP: New dataset API.
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisCummins committed Mar 20, 2021
1 parent b1760a6 commit 07538a2
Show file tree
Hide file tree
Showing 47 changed files with 1,595 additions and 861 deletions.
3 changes: 2 additions & 1 deletion compiler_gym/bin/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ py_binary(
srcs = ["datasets.py"],
visibility = ["//visibility:public"],
deps = [
"//compiler_gym/datasets:dataset",
":service",
"//compiler_gym/envs",
"//compiler_gym/util",
"//compiler_gym/util/flags:env_from_flags",
Expand Down Expand Up @@ -83,6 +83,7 @@ py_binary(
srcs = ["service.py"],
visibility = ["//visibility:public"],
deps = [
"//compiler_gym/datasets",
"//compiler_gym/envs",
"//compiler_gym/spaces",
"//compiler_gym/util",
Expand Down
105 changes: 6 additions & 99 deletions compiler_gym/bin/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,6 @@
+-------------------+--------------+-----------------+----------------+
These benchmarks are ready for use. Deactivate them using `--deactivate=<name>`.
+---------------------+-----------+-----------------+----------------+
| Inactive Datasets | License | #. Benchmarks | Size on disk |
+=====================+===========+=================+================+
| Total | | 0 | 0 Bytes |
+---------------------+-----------+-----------------+----------------+
These benchmarks may be activated using `--activate=<name>`.
+------------------------+---------------------------------+-----------------+----------------+
| Downloadable Dataset | License | #. Benchmarks | Size on disk |
+========================+=================================+=================+================+
| blas-v0 | BSD 3-Clause | 300 | 4.0 MB |
+------------------------+---------------------------------+-----------------+----------------+
| polybench-v0 | BSD 3-Clause | 27 | 162.6 kB |
+------------------------+---------------------------------+-----------------+----------------+
These benchmarks may be installed using `--download=<name> --activate=<name>`.
Downloading datasets
--------------------
Expand Down Expand Up @@ -131,23 +114,13 @@
A :code:`--delete_all` flag can be used to delete all of the locally installed
datasets.
"""
import os
import sys
from pathlib import Path
from typing import Tuple

import humanize
from absl import app, flags

from compiler_gym.datasets.dataset import (
LegacyDataset,
activate,
deactivate,
delete,
require,
)
from compiler_gym.bin.service import summarize_datasets
from compiler_gym.datasets.dataset import activate, deactivate, delete, require
from compiler_gym.util.flags.env_from_flags import env_from_flags
from compiler_gym.util.tabulate import tabulate

flags.DEFINE_list(
"download",
Expand Down Expand Up @@ -175,69 +148,34 @@
FLAGS = flags.FLAGS


def get_count_and_size_of_directory_contents(root: Path) -> Tuple[int, int]:
    """Return the number of files in, and total size of, a directory tree.

    The directory is walked recursively.

    :param root: The directory to inspect.
    :return: A ``(file_count, size_in_bytes)`` tuple. Both values are 0 for
        an empty or nonexistent directory.
    """
    count, size = 0, 0
    # NOTE: the loop variable previously shadowed the `root` parameter;
    # renamed so the parameter stays meaningful throughout the function.
    for dirpath, _, files in os.walk(str(root)):
        count += len(files)
        size += sum(os.path.getsize(os.path.join(dirpath, name)) for name in files)
    return count, size


def enumerate_directory(name: str, path: Path):
    """Render a table summarizing the dataset manifests in a directory.

    :param name: The heading for the first table column.
    :param path: A directory containing zero or more ``*.json`` dataset
        manifest files readable by :meth:`LegacyDataset.from_json_file`.
    :return: A formatted table string with one row per dataset plus a
        trailing "Total" row aggregating benchmark counts and sizes.
    """
    rows = []
    # Sort entries so the table is deterministic; iterdir() order is
    # filesystem-dependent. Also rename the loop variable, which previously
    # shadowed the `path` parameter.
    for manifest in sorted(path.iterdir()):
        # Skip subdirectories and anything that is not a JSON manifest.
        if not manifest.is_file() or not manifest.name.endswith(".json"):
            continue
        dataset = LegacyDataset.from_json_file(manifest)
        rows.append(
            (dataset.name, dataset.license, dataset.file_count, dataset.size_bytes)
        )
    rows.append(("Total", "", sum(r[2] for r in rows), sum(r[3] for r in rows)))
    return tabulate(
        [(n, l, humanize.intcomma(f), humanize.naturalsize(s)) for n, l, f, s in rows],
        headers=(name, "License", "#. Benchmarks", "Size on disk"),
    )


def main(argv):
"""Main entry point."""
if len(argv) != 1:
raise app.UsageError(f"Unknown command line arguments: {argv[1:]}")

env = env_from_flags()
try:
if not env.datasets_site_path:
raise app.UsageError("Environment has no benchmarks site path")

env.datasets_site_path.mkdir(parents=True, exist_ok=True)
env.inactive_datasets_site_path.mkdir(parents=True, exist_ok=True)

invalidated_manifest = False

for name_or_url in FLAGS.download:
require(env, name_or_url)

if FLAGS.download_all:
for dataset in env.available_datasets:
require(env, dataset)
for dataset in env.datasets:
dataset.install()

for name in FLAGS.activate:
activate(env, name)
invalidated_manifest = True

if FLAGS.activate_all:
for path in env.inactive_datasets_site_path.iterdir():
activate(env, path.name)
invalidated_manifest = True

for name in FLAGS.deactivate:
deactivate(env, name)
invalidated_manifest = True

if FLAGS.deactivate_all:
for path in env.datasets_site_path.iterdir():
deactivate(env, path.name)
invalidated_manifest = True

for name in FLAGS.delete:
Expand All @@ -246,41 +184,10 @@ def main(argv):
if invalidated_manifest:
env.make_manifest_file()

print(f"{env.spec.id} benchmarks site dir: {env.datasets_site_path}")
print(f"{env.spec.id} benchmarks site dir: {env.datasets.site_data_path}")
print()
print(
enumerate_directory("Active Datasets", env.datasets_site_path),
)
print(
"These benchmarks are ready for use. Deactivate them using `--deactivate=<name>`."
)
print()
print(enumerate_directory("Inactive Datasets", env.inactive_datasets_site_path))
print("These benchmarks may be activated using `--activate=<name>`.")
print()
print(
tabulate(
sorted(
[
(
d.name,
d.license,
humanize.intcomma(d.file_count),
humanize.naturalsize(d.size_bytes),
)
for d in env.available_datasets.values()
]
),
headers=(
"Downloadable Dataset",
"License",
"#. Benchmarks",
"Size on disk",
),
)
)
print(
"These benchmarks may be installed using `--download=<name> --activate=<name>`."
summarize_datasets(env.datasets),
)
finally:
env.close()
Expand Down
34 changes: 29 additions & 5 deletions compiler_gym/bin/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,17 @@
$ python -m compiler_gym.bin.service --local_service_binary=/path/to/service/binary
"""
from typing import Iterable

import humanize
from absl import app, flags

from compiler_gym.datasets import Dataset
from compiler_gym.envs import CompilerEnv
from compiler_gym.spaces import Commandline
from compiler_gym.util.flags.env_from_flags import env_from_flags
from compiler_gym.util.tabulate import tabulate
from compiler_gym.util.truncate import truncate

flags.DEFINE_integer(
"heading_level",
Expand All @@ -93,18 +98,37 @@ def shape2str(shape, n: int = 80):
return f"`{string}`"


def summarize_datasets(datasets: Iterable[Dataset]) -> str:
    """Tabulate a summary of a collection of datasets.

    :param datasets: The datasets to summarize.
    :return: A formatted table with one row per dataset and a final
        "Total" row aggregating benchmark counts and on-disk sizes.
    """
    # Gather raw numeric values first; human-readable formatting is applied
    # in a single pass once the totals are known.
    rows = [
        (
            dataset.name,
            truncate(dataset.description, max_line_len=60),
            dataset.n,
            dataset.site_data_size_in_bytes,
        )
        for dataset in datasets
    ]
    total_count = sum(row[2] for row in rows)
    total_size = sum(row[3] for row in rows)
    rows.append(("Total", "", total_count, total_size))
    # A falsy benchmark count is rendered as "∞" (unbounded dataset).
    formatted = [
        (
            name,
            description,
            humanize.intcomma(count) if count else "∞",
            humanize.naturalsize(size),
        )
        for name, description, count, size in rows
    ]
    return tabulate(
        formatted,
        headers=("Dataset", "Description", "#. Benchmarks", "Size on disk"),
    )


def print_service_capabilities(env: CompilerEnv, base_heading_level: int = 1):
"""Discover and print the capabilities of a CompilerGym service.
:param env: An environment.
"""
print(header(f"CompilerGym Service `{env.service}`", base_heading_level).strip())
print(header("Programs", base_heading_level + 1))
print(header("Datasets", base_heading_level + 1))
print(
tabulate(
[(p,) for p in sorted(env.benchmarks)],
headers=("Benchmark",),
)
summarize_datasets(env.datasets),
)
print(header("Observation Spaces", base_heading_level + 1))
print(
Expand Down
19 changes: 9 additions & 10 deletions compiler_gym/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@ load("@rules_python//python:defs.bzl", "py_library")

py_library(
name = "datasets",
srcs = ["__init__.py"],
visibility = ["//visibility:public"],
deps = [
":dataset",
srcs = [
"__init__.py",
"benchmark.py",
"dataset.py",
"datasets.py",
"tar_dataset.py",
],
)

py_library(
name = "dataset",
srcs = ["dataset.py"],
visibility = ["//compiler_gym:__subpackages__"],
visibility = ["//visibility:public"],
deps = [
"//compiler_gym:validation_result",
"//compiler_gym/service/proto",
"//compiler_gym/util",
],
)
17 changes: 16 additions & 1 deletion compiler_gym/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,27 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Manage datasets of benchmarks."""
from compiler_gym.datasets.benchmark import Benchmark
from compiler_gym.datasets.dataset import (
Dataset,
LegacyDataset,
activate,
deactivate,
delete,
require,
)
from compiler_gym.datasets.datasets import Datasets
from compiler_gym.datasets.tar_dataset import TarDataset, TarDatasetWithManifest

__all__ = ["LegacyDataset", "require", "activate", "deactivate", "delete"]
__all__ = [
"activate",
"Benchmark",
"Dataset",
"Datasets",
"deactivate",
"delete",
"LegacyDataset",
"require",
"TarDataset",
"TarDatasetWithManifest",
]
63 changes: 63 additions & 0 deletions compiler_gym/datasets/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from pathlib import Path
from typing import Callable, List

from compiler_gym.service.proto import Benchmark as BenchmarkProto
from compiler_gym.service.proto import File
from compiler_gym.validation_result import ValidationResult


class Benchmark(object):
    """A single program for a compiler service to operate on.

    A benchmark is identified by a URI and wraps the protocol buffer message
    that is sent to the compiler service. Optional validation callbacks may
    be registered to support post-run checking of results.
    """

    __slots__ = ["_uri", "_program_data", "_validation_callbacks"]

    def __init__(self, uri: str, program_data: BenchmarkProto):
        """Construct a benchmark.

        :param uri: The unique identifier of this benchmark.
        :param program_data: The protocol buffer message describing the
            program, as consumed by the service.
        """
        self._uri = uri
        self._program_data = program_data
        # Callbacks registered through add_validation_callback(). Each takes
        # a CompilerEnv and returns a ValidationResult.
        self._validation_callbacks = []

    @property
    def uri(self) -> str:
        """The unique identifier of this benchmark."""
        return self._uri

    def program_data(self) -> BenchmarkProto:
        """Return the data that the service needs to run this benchmark."""
        return self._program_data

    def is_validatable(self) -> bool:
        """Return whether any validation callbacks are registered."""
        # Truthiness check replaces the non-idiomatic `!= []` comparison.
        return bool(self._validation_callbacks)

    def validate(self) -> ValidationResult:
        """Run the validation callbacks. Not yet implemented."""
        raise NotImplementedError

    def validation_callbacks(
        self,
    ) -> List[Callable[["CompilerEnv"], ValidationResult]]:  # noqa: F821
        """Return the registered ad-hoc validation callbacks
        (e.g. difftest, valgrind, etc)."""
        return self._validation_callbacks

    def add_validation_callback(
        self,
        validation_callback: Callable[["CompilerEnv"], ValidationResult],  # noqa: F821
    ):
        """Register a callback to run ad-hoc validation of this benchmark.

        :param validation_callback: A callable taking a CompilerEnv and
            returning a ValidationResult.
        """
        self._validation_callbacks.append(validation_callback)

    def __repr__(self) -> str:
        return str(self.uri)

    @classmethod
    def from_file(cls, uri: str, path: Path):
        """Construct a benchmark from a program file on disk.

        :param uri: The URI to assign to the benchmark.
        :param path: The path of the program file.
        """
        # NOTE(review): `file:///` followed by an absolute POSIX path yields
        # four leading slashes (file:////abs/path) — confirm the service
        # accepts this URI form.
        return cls(
            uri=uri,
            program_data=BenchmarkProto(
                uri=uri, program=File(uri=f"file:///{Path(path).absolute()}")
            ),
        )

    @classmethod
    def from_file_data(cls, uri: str, data: bytes):
        """Construct a benchmark from in-memory program contents.

        :param uri: The URI to assign to the benchmark.
        :param data: The raw bytes of the program file.
        """
        return cls(
            uri=uri, program_data=BenchmarkProto(uri=uri, program=File(contents=data))
        )
Loading

0 comments on commit 07538a2

Please sign in to comment.