Skip to content

Commit

Permalink
WIP: New dataset API.
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisCummins committed Mar 20, 2021
1 parent b1760a6 commit 07538a2
Show file tree
Hide file tree
Showing 47 changed files with 1,595 additions and 861 deletions.
3 changes: 2 additions & 1 deletion compiler_gym/bin/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ py_binary(
srcs = ["datasets.py"],
visibility = ["//visibility:public"],
deps = [
"//compiler_gym/datasets:dataset",
":service",
"//compiler_gym/envs",
"//compiler_gym/util",
"//compiler_gym/util/flags:env_from_flags",
Expand Down Expand Up @@ -83,6 +83,7 @@ py_binary(
srcs = ["service.py"],
visibility = ["//visibility:public"],
deps = [
"//compiler_gym/datasets",
"//compiler_gym/envs",
"//compiler_gym/spaces",
"//compiler_gym/util",
Expand Down
105 changes: 6 additions & 99 deletions compiler_gym/bin/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,6 @@
+-------------------+--------------+-----------------+----------------+
These benchmarks are ready for use. Deactivate them using `--deactivate=<name>`.
+---------------------+-----------+-----------------+----------------+
| Inactive Datasets | License | #. Benchmarks | Size on disk |
+=====================+===========+=================+================+
| Total | | 0 | 0 Bytes |
+---------------------+-----------+-----------------+----------------+
These benchmarks may be activated using `--activate=<name>`.
+------------------------+---------------------------------+-----------------+----------------+
| Downloadable Dataset | License | #. Benchmarks | Size on disk |
+========================+=================================+=================+================+
| blas-v0 | BSD 3-Clause | 300 | 4.0 MB |
+------------------------+---------------------------------+-----------------+----------------+
| polybench-v0 | BSD 3-Clause | 27 | 162.6 kB |
+------------------------+---------------------------------+-----------------+----------------+
These benchmarks may be installed using `--download=<name> --activate=<name>`.
Downloading datasets
--------------------
Expand Down Expand Up @@ -131,23 +114,13 @@
A :code:`--delete_all` flag can be used to delete all of the locally installed
datasets.
"""
import os
import sys
from pathlib import Path
from typing import Tuple

import humanize
from absl import app, flags

from compiler_gym.datasets.dataset import (
LegacyDataset,
activate,
deactivate,
delete,
require,
)
from compiler_gym.bin.service import summarize_datasets
from compiler_gym.datasets.dataset import activate, deactivate, delete, require
from compiler_gym.util.flags.env_from_flags import env_from_flags
from compiler_gym.util.tabulate import tabulate

flags.DEFINE_list(
"download",
Expand Down Expand Up @@ -175,69 +148,34 @@
FLAGS = flags.FLAGS


def get_count_and_size_of_directory_contents(root: Path) -> Tuple[int, int]:
    """Return the number of files in, and total size of, a directory tree.

    The directory is walked recursively.

    :param root: The directory to inspect.
    :return: A ``(file_count, size_in_bytes)`` tuple. Both values are 0 for
        an empty or nonexistent directory.
    """
    count, size = 0, 0
    # NOTE: the loop variable previously shadowed the `root` parameter;
    # renamed so the parameter stays meaningful throughout the function.
    for dirpath, _, files in os.walk(str(root)):
        count += len(files)
        size += sum(os.path.getsize(os.path.join(dirpath, name)) for name in files)
    return count, size


def enumerate_directory(name: str, path: Path):
    """Render a table summarizing the dataset manifests in a directory.

    :param name: The heading for the first table column.
    :param path: A directory containing zero or more ``*.json`` dataset
        manifest files readable by :meth:`LegacyDataset.from_json_file`.
    :return: A formatted table string with one row per dataset plus a
        trailing "Total" row aggregating benchmark counts and sizes.
    """
    rows = []
    # Sort entries so the table is deterministic; iterdir() order is
    # filesystem-dependent. Also rename the loop variable, which previously
    # shadowed the `path` parameter.
    for manifest in sorted(path.iterdir()):
        # Skip subdirectories and anything that is not a JSON manifest.
        if not manifest.is_file() or not manifest.name.endswith(".json"):
            continue
        dataset = LegacyDataset.from_json_file(manifest)
        rows.append(
            (dataset.name, dataset.license, dataset.file_count, dataset.size_bytes)
        )
    rows.append(("Total", "", sum(r[2] for r in rows), sum(r[3] for r in rows)))
    return tabulate(
        [(n, l, humanize.intcomma(f), humanize.naturalsize(s)) for n, l, f, s in rows],
        headers=(name, "License", "#. Benchmarks", "Size on disk"),
    )


def main(argv):
"""Main entry point."""
if len(argv) != 1:
raise app.UsageError(f"Unknown command line arguments: {argv[1:]}")

env = env_from_flags()
try:
if not env.datasets_site_path:
raise app.UsageError("Environment has no benchmarks site path")

env.datasets_site_path.mkdir(parents=True, exist_ok=True)
env.inactive_datasets_site_path.mkdir(parents=True, exist_ok=True)

invalidated_manifest = False

for name_or_url in FLAGS.download:
require(env, name_or_url)

if FLAGS.download_all:
for dataset in env.available_datasets:
require(env, dataset)
for dataset in env.datasets:
dataset.install()

for name in FLAGS.activate:
activate(env, name)
invalidated_manifest = True

if FLAGS.activate_all:
for path in env.inactive_datasets_site_path.iterdir():
activate(env, path.name)
invalidated_manifest = True

for name in FLAGS.deactivate:
deactivate(env, name)
invalidated_manifest = True

if FLAGS.deactivate_all:
for path in env.datasets_site_path.iterdir():
deactivate(env, path.name)
invalidated_manifest = True

for name in FLAGS.delete:
Expand All @@ -246,41 +184,10 @@ def main(argv):
if invalidated_manifest:
env.make_manifest_file()

print(f"{env.spec.id} benchmarks site dir: {env.datasets_site_path}")
print(f"{env.spec.id} benchmarks site dir: {env.datasets.site_data_path}")
print()
print(
enumerate_directory("Active Datasets", env.datasets_site_path),
)
print(
"These benchmarks are ready for use. Deactivate them using `--deactivate=<name>`."
)
print()
print(enumerate_directory("Inactive Datasets", env.inactive_datasets_site_path))
print("These benchmarks may be activated using `--activate=<name>`.")
print()
print(
tabulate(
sorted(
[
(
d.name,
d.license,
humanize.intcomma(d.file_count),
humanize.naturalsize(d.size_bytes),
)
for d in env.available_datasets.values()
]
),
headers=(
"Downloadable Dataset",
"License",
"#. Benchmarks",
"Size on disk",
),
)
)
print(
"These benchmarks may be installed using `--download=<name> --activate=<name>`."
summarize_datasets(env.datasets),
)
finally:
env.close()
Expand Down
34 changes: 29 additions & 5 deletions compiler_gym/bin/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,17 @@
$ python -m compiler_gym.bin.service --local_service_binary=/path/to/service/binary
"""
from typing import Iterable

import humanize
from absl import app, flags

from compiler_gym.datasets import Dataset
from compiler_gym.envs import CompilerEnv
from compiler_gym.spaces import Commandline
from compiler_gym.util.flags.env_from_flags import env_from_flags
from compiler_gym.util.tabulate import tabulate
from compiler_gym.util.truncate import truncate

flags.DEFINE_integer(
"heading_level",
Expand All @@ -93,18 +98,37 @@ def shape2str(shape, n: int = 80):
return f"`{string}`"


def summarize_datasets(datasets: Iterable[Dataset]) -> str:
    """Tabulate a summary of a collection of datasets.

    :param datasets: The datasets to summarize.
    :return: A formatted table with one row per dataset and a final
        "Total" row aggregating benchmark counts and on-disk sizes.
    """
    # Gather raw numeric values first; human-readable formatting is applied
    # in a single pass once the totals are known.
    rows = [
        (
            dataset.name,
            truncate(dataset.description, max_line_len=60),
            dataset.n,
            dataset.site_data_size_in_bytes,
        )
        for dataset in datasets
    ]
    total_count = sum(row[2] for row in rows)
    total_size = sum(row[3] for row in rows)
    rows.append(("Total", "", total_count, total_size))
    # A falsy benchmark count is rendered as "∞" (unbounded dataset).
    formatted = [
        (
            name,
            description,
            humanize.intcomma(count) if count else "∞",
            humanize.naturalsize(size),
        )
        for name, description, count, size in rows
    ]
    return tabulate(
        formatted,
        headers=("Dataset", "Description", "#. Benchmarks", "Size on disk"),
    )


def print_service_capabilities(env: CompilerEnv, base_heading_level: int = 1):
"""Discover and print the capabilities of a CompilerGym service.
:param env: An environment.
"""
print(header(f"CompilerGym Service `{env.service}`", base_heading_level).strip())
print(header("Programs", base_heading_level + 1))
print(header("Datasets", base_heading_level + 1))
print(
tabulate(
[(p,) for p in sorted(env.benchmarks)],
headers=("Benchmark",),
)
summarize_datasets(env.datasets),
)
print(header("Observation Spaces", base_heading_level + 1))
print(
Expand Down
19 changes: 9 additions & 10 deletions compiler_gym/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@ load("@rules_python//python:defs.bzl", "py_library")

py_library(
name = "datasets",
srcs = ["__init__.py"],
visibility = ["//visibility:public"],
deps = [
":dataset",
srcs = [
"__init__.py",
"benchmark.py",
"dataset.py",
"datasets.py",
"tar_dataset.py",
],
)

py_library(
name = "dataset",
srcs = ["dataset.py"],
visibility = ["//compiler_gym:__subpackages__"],
visibility = ["//visibility:public"],
deps = [
"//compiler_gym:validation_result",
"//compiler_gym/service/proto",
"//compiler_gym/util",
],
)
17 changes: 16 additions & 1 deletion compiler_gym/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,27 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Manage datasets of benchmarks."""
from compiler_gym.datasets.benchmark import Benchmark
from compiler_gym.datasets.dataset import (
Dataset,
LegacyDataset,
activate,
deactivate,
delete,
require,
)
from compiler_gym.datasets.datasets import Datasets
from compiler_gym.datasets.tar_dataset import TarDataset, TarDatasetWithManifest

__all__ = ["LegacyDataset", "require", "activate", "deactivate", "delete"]
__all__ = [
"activate",
"Benchmark",
"Dataset",
"Datasets",
"deactivate",
"delete",
"LegacyDataset",
"require",
"TarDataset",
"TarDatasetWithManifest",
]
63 changes: 63 additions & 0 deletions compiler_gym/datasets/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from pathlib import Path
from typing import Callable, List

from compiler_gym.service.proto import Benchmark as BenchmarkProto
from compiler_gym.service.proto import File
from compiler_gym.validation_result import ValidationResult


class Benchmark(object):
    """A single program for a compiler service to operate on.

    A benchmark is identified by a URI and wraps the protocol buffer message
    that is sent to the compiler service. Optional validation callbacks may
    be registered to support post-run checking of results.
    """

    __slots__ = ["_uri", "_program_data", "_validation_callbacks"]

    def __init__(self, uri: str, program_data: BenchmarkProto):
        """Construct a benchmark.

        :param uri: The unique identifier of this benchmark.
        :param program_data: The protocol buffer message describing the
            program, as consumed by the service.
        """
        self._uri = uri
        self._program_data = program_data
        # Callbacks registered through add_validation_callback(). Each takes
        # a CompilerEnv and returns a ValidationResult.
        self._validation_callbacks = []

    @property
    def uri(self) -> str:
        """The unique identifier of this benchmark."""
        return self._uri

    def program_data(self) -> BenchmarkProto:
        """Return the data that the service needs to run this benchmark."""
        return self._program_data

    def is_validatable(self) -> bool:
        """Return whether any validation callbacks are registered."""
        # Truthiness check replaces the non-idiomatic `!= []` comparison.
        return bool(self._validation_callbacks)

    def validate(self) -> ValidationResult:
        """Run the validation callbacks. Not yet implemented."""
        raise NotImplementedError

    def validation_callbacks(
        self,
    ) -> List[Callable[["CompilerEnv"], ValidationResult]]:  # noqa: F821
        """Return the registered ad-hoc validation callbacks
        (e.g. difftest, valgrind, etc)."""
        return self._validation_callbacks

    def add_validation_callback(
        self,
        validation_callback: Callable[["CompilerEnv"], ValidationResult],  # noqa: F821
    ):
        """Register a callback to run ad-hoc validation of this benchmark.

        :param validation_callback: A callable taking a CompilerEnv and
            returning a ValidationResult.
        """
        self._validation_callbacks.append(validation_callback)

    def __repr__(self) -> str:
        return str(self.uri)

    @classmethod
    def from_file(cls, uri: str, path: Path):
        """Construct a benchmark from a program file on disk.

        :param uri: The URI to assign to the benchmark.
        :param path: The path of the program file.
        """
        # NOTE(review): `file:///` followed by an absolute POSIX path yields
        # four leading slashes (file:////abs/path) — confirm the service
        # accepts this URI form.
        return cls(
            uri=uri,
            program_data=BenchmarkProto(
                uri=uri, program=File(uri=f"file:///{Path(path).absolute()}")
            ),
        )

    @classmethod
    def from_file_data(cls, uri: str, data: bytes):
        """Construct a benchmark from in-memory program contents.

        :param uri: The URI to assign to the benchmark.
        :param data: The raw bytes of the program file.
        """
        return cls(
            uri=uri, program_data=BenchmarkProto(uri=uri, program=File(contents=data))
        )
Loading

0 comments on commit 07538a2

Please sign in to comment.