Skip to content

Commit

Permalink
[datasets] Switch CompilerEnv to the new dataset API.
Browse files Browse the repository at this point in the history
Issue #45.
  • Loading branch information
ChrisCummins committed Apr 27, 2021
1 parent 8ccae67 commit 336d959
Show file tree
Hide file tree
Showing 199 changed files with 2,242 additions and 3,917 deletions.
4 changes: 2 additions & 2 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ exports_files([
py_library(
name = "CompilerGym",
data = [
"//compiler_gym/third_party/cBench:benchmarks_list",
"//compiler_gym/third_party/cBench:crc32",
"//compiler_gym/third_party/cbench:benchmarks_list",
"//compiler_gym/third_party/cbench:crc32",
],
deps = [
"//compiler_gym",
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/bench_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@
# adpcm is small and jpeg-d is large. ghostscript is the largest but that
# one takes too long.
@pytest.fixture(
params=["cBench-v1/crc32", "cBench-v1/jpeg-d"],
params=["cbench-v1/crc32", "cbench-v1/jpeg-d"],
ids=["fast_benchmark", "slow_benchmark"],
)
def benchmark_name(request) -> str:
yield request.param


@pytest.fixture(params=["cBench-v1/crc32"], ids=["fast_benchmark"])
@pytest.fixture(params=["cbench-v1/crc32"], ids=["fast_benchmark"])
def fast_benchmark_name(request) -> str:
yield request.param

Expand Down
2 changes: 2 additions & 0 deletions compiler_gym/bin/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ py_binary(
srcs = ["datasets.py"],
visibility = ["//visibility:public"],
deps = [
":service",
"//compiler_gym/datasets",
"//compiler_gym/envs",
"//compiler_gym/util",
Expand Down Expand Up @@ -81,6 +82,7 @@ py_binary(
srcs = ["service.py"],
visibility = ["//visibility:public"],
deps = [
"//compiler_gym/datasets",
"//compiler_gym/envs",
"//compiler_gym/spaces",
"//compiler_gym/util",
Expand Down
100 changes: 16 additions & 84 deletions compiler_gym/bin/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,23 +131,14 @@
A :code:`--delete_all` flag can be used to delete all of the locally installed
datasets.
"""
import os
import sys
from pathlib import Path
from typing import Tuple

import humanize
from absl import app, flags
from deprecated.sphinx import deprecated

from compiler_gym.datasets.dataset import (
LegacyDataset,
activate,
deactivate,
delete,
require,
)
from compiler_gym.bin.service import summarize_datasets
from compiler_gym.datasets.dataset import activate, deactivate, delete
from compiler_gym.util.flags.env_from_flags import env_from_flags
from compiler_gym.util.tabulate import tabulate

flags.DEFINE_list(
"download",
Expand Down Expand Up @@ -175,69 +166,43 @@
FLAGS = flags.FLAGS


def get_count_and_size_of_directory_contents(root: Path) -> Tuple[int, int]:
    """Return the number of files and their combined size under a directory.

    The directory tree rooted at ``root`` is walked recursively.

    :param root: The directory to inspect. A path that does not exist yields
        ``(0, 0)`` because :func:`os.walk` produces no entries for it.
    :return: A ``(file_count, size_in_bytes)`` tuple.
    """
    count, size = 0, 0
    # Distinct loop names: the original re-bound the ``root`` parameter here.
    for dirpath, _, filenames in os.walk(str(root)):
        count += len(filenames)
        for filename in filenames:
            try:
                size += os.path.getsize(os.path.join(dirpath, filename))
            except OSError:
                # A dangling symlink, or a file removed while walking, is
                # still counted as an entry but contributes no bytes. One
                # unreadable entry should not abort the whole summary.
                continue
    return count, size


def enumerate_directory(name: str, path: Path):
    """Tabulate the datasets described by the JSON files in a directory.

    Every regular file in ``path`` whose name ends with ``.json`` is loaded
    with ``LegacyDataset.from_json_file``. One row is produced per dataset,
    followed by a final "Total" row that sums the file counts and sizes.

    :param name: The heading of the first table column.
    :param path: The directory to scan. Only its direct children are
        examined; subdirectories are not descended into.
    :return: The value returned by ``tabulate`` for the formatted rows.
    """
    rows = []
    # Iterate in sorted order: Path.iterdir() yields entries in arbitrary
    # order, which would make the table differ from run to run. A distinct
    # loop name also avoids re-binding the ``path`` parameter.
    for manifest in sorted(path.iterdir()):
        if not manifest.is_file() or not manifest.name.endswith(".json"):
            continue
        dataset = LegacyDataset.from_json_file(manifest)
        rows.append(
            (dataset.name, dataset.license, dataset.file_count, dataset.size_bytes)
        )

    # Compute the totals before appending, so the summary row only ever
    # reflects the dataset rows.
    total_files = sum(row[2] for row in rows)
    total_bytes = sum(row[3] for row in rows)
    rows.append(("Total", "", total_files, total_bytes))

    return tabulate(
        [
            (
                dataset_name,
                dataset_license,
                humanize.intcomma(file_count),
                humanize.naturalsize(size_bytes),
            )
            for dataset_name, dataset_license, file_count, size_bytes in rows
        ],
        headers=(name, "License", "#. Benchmarks", "Size on disk"),
    )


@deprecated(
version="0.1.8",
reason=(
"Command-line management of datasets is deprecated. Please use "
":mod:`compiler_gym.bin.service` to print a tabular overview of the "
"available datasets. For management of datasets, use the "
":class:`env.datasets <compiler_gym.env>` property."
),
)
def main(argv):
"""Main entry point."""
if len(argv) != 1:
raise app.UsageError(f"Unknown command line arguments: {argv[1:]}")

env = env_from_flags()
try:
if not env.datasets_site_path:
raise app.UsageError("Environment has no benchmarks site path")

env.datasets_site_path.mkdir(parents=True, exist_ok=True)
env.inactive_datasets_site_path.mkdir(parents=True, exist_ok=True)

invalidated_manifest = False

for name_or_url in FLAGS.download:
require(env, name_or_url)
env.datasets.install(name_or_url)

if FLAGS.download_all:
for dataset in env.available_datasets:
require(env, dataset)
for dataset in env.datasets:
dataset.install()

for name in FLAGS.activate:
activate(env, name)
invalidated_manifest = True

if FLAGS.activate_all:
for path in env.inactive_datasets_site_path.iterdir():
activate(env, path.name)
invalidated_manifest = True

for name in FLAGS.deactivate:
deactivate(env, name)
invalidated_manifest = True

if FLAGS.deactivate_all:
for path in env.datasets_site_path.iterdir():
deactivate(env, path.name)
invalidated_manifest = True

for name in FLAGS.delete:
Expand All @@ -246,41 +211,8 @@ def main(argv):
if invalidated_manifest:
env.make_manifest_file()

print(f"{env.spec.id} benchmarks site dir: {env.datasets_site_path}")
print()
print(
enumerate_directory("Active Datasets", env.datasets_site_path),
)
print(
"These benchmarks are ready for use. Deactivate them using `--deactivate=<name>`."
)
print()
print(enumerate_directory("Inactive Datasets", env.inactive_datasets_site_path))
print("These benchmarks may be activated using `--activate=<name>`.")
print()
print(
tabulate(
sorted(
[
(
d.name,
d.license,
humanize.intcomma(d.file_count),
humanize.naturalsize(d.size_bytes),
)
for d in env.available_datasets.values()
]
),
headers=(
"Downloadable Dataset",
"License",
"#. Benchmarks",
"Size on disk",
),
)
)
print(
"These benchmarks may be installed using `--download=<name> --activate=<name>`."
summarize_datasets(env.datasets),
)
finally:
env.close()
Expand Down
Loading

0 comments on commit 336d959

Please sign in to comment.