refactor(config): Move device & amp args to PreTrainedConfig #812

Merged · 11 commits · Mar 6, 2025
18 changes: 9 additions & 9 deletions Makefile
@@ -47,6 +47,7 @@ test-act-ete-train:
--policy.dim_model=64 \
--policy.n_action_steps=20 \
--policy.chunk_size=20 \
--policy.device=$(DEVICE) \
--env.type=aloha \
--env.episode_length=5 \
--dataset.repo_id=lerobot/aloha_sim_transfer_cube_human \
@@ -61,7 +62,6 @@ test-act-ete-train:
--save_checkpoint=true \
--log_freq=1 \
--wandb.enable=false \
--device=$(DEVICE) \
--output_dir=tests/outputs/act/

test-act-ete-train-resume:
@@ -72,18 +72,19 @@ test-act-ete-train-resume:
test-act-ete-eval:
python lerobot/scripts/eval.py \
--policy.path=tests/outputs/act/checkpoints/000004/pretrained_model \
--policy.device=$(DEVICE) \
--env.type=aloha \
--env.episode_length=5 \
--eval.n_episodes=1 \
--eval.batch_size=1 \
--device=$(DEVICE)
--eval.batch_size=1

test-diffusion-ete-train:
python lerobot/scripts/train.py \
--policy.type=diffusion \
--policy.down_dims='[64,128,256]' \
--policy.diffusion_step_embed_dim=32 \
--policy.num_inference_steps=10 \
--policy.device=$(DEVICE) \
--env.type=pusht \
--env.episode_length=5 \
--dataset.repo_id=lerobot/pusht \
@@ -98,21 +99,21 @@ test-diffusion-ete-train:
--save_freq=2 \
--log_freq=1 \
--wandb.enable=false \
--device=$(DEVICE) \
--output_dir=tests/outputs/diffusion/

test-diffusion-ete-eval:
python lerobot/scripts/eval.py \
--policy.path=tests/outputs/diffusion/checkpoints/000002/pretrained_model \
--policy.device=$(DEVICE) \
--env.type=pusht \
--env.episode_length=5 \
--eval.n_episodes=1 \
--eval.batch_size=1 \
--device=$(DEVICE)
--eval.batch_size=1

test-tdmpc-ete-train:
python lerobot/scripts/train.py \
--policy.type=tdmpc \
--policy.device=$(DEVICE) \
--env.type=xarm \
--env.task=XarmLift-v0 \
--env.episode_length=5 \
@@ -128,15 +129,14 @@ test-tdmpc-ete-train:
--save_freq=2 \
--log_freq=1 \
--wandb.enable=false \
--device=$(DEVICE) \
--output_dir=tests/outputs/tdmpc/

test-tdmpc-ete-eval:
python lerobot/scripts/eval.py \
--policy.path=tests/outputs/tdmpc/checkpoints/000002/pretrained_model \
--policy.device=$(DEVICE) \
--env.type=xarm \
--env.episode_length=5 \
--env.task=XarmLift-v0 \
--eval.n_episodes=1 \
--eval.batch_size=1 \
--device=$(DEVICE)
--eval.batch_size=1
9 changes: 3 additions & 6 deletions lerobot/common/policies/factory.py
@@ -16,7 +16,6 @@

import logging

import torch
from torch import nn

from lerobot.common.datasets.lerobot_dataset import LeRobotDatasetMetadata
@@ -76,7 +75,6 @@ def make_policy_config(policy_type: str, **kwargs) -> PreTrainedConfig:

def make_policy(
cfg: PreTrainedConfig,
device: str | torch.device,
ds_meta: LeRobotDatasetMetadata | None = None,
env_cfg: EnvConfig | None = None,
) -> PreTrainedPolicy:
@@ -88,15 +86,14 @@ def make_policy(
Args:
cfg (PreTrainedConfig): The config of the policy to make. If `pretrained_path` is set, the policy will
be loaded with the weights from that path.
device (str): the device to load the policy onto.
ds_meta (LeRobotDatasetMetadata | None, optional): Dataset metadata to take input/output shapes and
statistics to use for (un)normalization of inputs/outputs in the policy. Defaults to None.
env_cfg (EnvConfig | None, optional): The config of a gym environment to parse features from. Must be
provided if ds_meta is not. Defaults to None.

Raises:
ValueError: Either ds_meta or env and env_cfg must be provided.
NotImplementedError: if the policy.type is 'vqbet' and the device 'mps' (due to an incompatibility)
NotImplementedError: if the policy.type is 'vqbet' and the policy device 'mps' (due to an incompatibility)

Returns:
PreTrainedPolicy: _description_
@@ -111,7 +108,7 @@ def make_policy(
# https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment
# variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be
# slower than running natively on MPS.
if cfg.type == "vqbet" and str(device) == "mps":
if cfg.type == "vqbet" and cfg.device == "mps":
raise NotImplementedError(
"Current implementation of VQBeT does not support `mps` backend. "
"Please use `cpu` or `cuda` backend."
@@ -145,7 +142,7 @@ def make_policy(
# Make a fresh policy.
policy = policy_cls(**kwargs)

policy.to(device)
policy.to(cfg.device)
assert isinstance(policy, nn.Module)

# policy = torch.compile(policy, mode="reduce-overhead")
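Note: `make_policy` no longer takes a `device` argument; it reads `cfg.device` and moves the policy onto it itself. A minimal sketch of a caller after this change (the checkpoint path is illustrative, and the dataset id is borrowed from the Makefile targets above):

```python
from lerobot.common.datasets.lerobot_dataset import LeRobotDatasetMetadata
from lerobot.common.policies.factory import make_policy
from lerobot.configs.policies import PreTrainedConfig

# Load the policy config from a checkpoint directory (illustrative path).
pretrained_path = "outputs/train/act_aloha/checkpoints/last/pretrained_model"
cfg = PreTrainedConfig.from_pretrained(pretrained_path)
cfg.pretrained_path = pretrained_path
cfg.device = "cuda"  # the device now lives on the policy config

# Dataset metadata provides input/output features and normalization stats.
ds_meta = LeRobotDatasetMetadata("lerobot/aloha_sim_transfer_cube_human")

# No `device=` argument anymore: make_policy calls policy.to(cfg.device) internally.
policy = make_policy(cfg, ds_meta=ds_meta)
```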
1 change: 1 addition & 0 deletions lerobot/common/policies/pi0/configuration_pi0.py
@@ -90,6 +90,7 @@ class PI0Config(PreTrainedConfig):
def __post_init__(self):
super().__post_init__()

# TODO(Steven): Validate device and amp? in all policy configs?
"""Input validation (not exhaustive)."""
if self.n_action_steps > self.chunk_size:
raise ValueError(
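The TODO above points at config-level validation of the new fields. As an illustration only (this is not part of the PR), such a check could reuse the existing helpers from `lerobot.common.utils.utils`:

```python
from lerobot.common.utils.utils import is_amp_available, is_torch_device_available

def _validate_device_and_amp(self) -> None:
    # Hypothetical helper a policy config could call from __post_init__.
    if not is_torch_device_available(self.device):
        raise ValueError(f"Device '{self.device}' is not available (expected cuda, mps or cpu).")
    if self.use_amp and not is_amp_available(self.device):
        raise ValueError(f"AMP is not available on device '{self.device}'.")
```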
@@ -45,7 +45,7 @@ def main():

cfg = PreTrainedConfig.from_pretrained(ckpt_torch_dir)
cfg.pretrained_path = ckpt_torch_dir
policy = make_policy(cfg, device, ds_meta=dataset.meta)
policy = make_policy(cfg, ds_meta=dataset.meta)

# policy = torch.compile(policy, mode="reduce-overhead")

@@ -101,7 +101,7 @@ def main():

cfg = PreTrainedConfig.from_pretrained(ckpt_torch_dir)
cfg.pretrained_path = ckpt_torch_dir
policy = make_policy(cfg, device, dataset_meta)
policy = make_policy(cfg, dataset_meta)

# loss_dict = policy.forward(batch, noise=noise, time=time_beta)
# loss_dict["loss"].backward()
7 changes: 3 additions & 4 deletions lerobot/common/policies/pretrained.py
@@ -86,7 +86,6 @@ def from_pretrained(
cache_dir: str | Path | None = None,
local_files_only: bool = False,
revision: str | None = None,
map_location: str = "cpu",
strict: bool = False,
**kwargs,
) -> T:
@@ -111,7 +110,7 @@ def from_pretrained(
if os.path.isdir(model_id):
print("Loading weights from local directory")
model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
policy = cls._load_as_safetensor(instance, model_file, map_location, strict)
policy = cls._load_as_safetensor(instance, model_file, config.device, strict)
else:
try:
model_file = hf_hub_download(
@@ -125,13 +124,13 @@ def from_pretrained(
token=token,
local_files_only=local_files_only,
)
policy = cls._load_as_safetensor(instance, model_file, map_location, strict)
policy = cls._load_as_safetensor(instance, model_file, config.device, strict)
except HfHubHTTPError as e:
raise FileNotFoundError(
f"{SAFETENSORS_SINGLE_FILE} not found on the HuggingFace Hub in {model_id}"
) from e

policy.to(map_location)
policy.to(config.device)
policy.eval()
return policy

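With `map_location` removed, the weights are loaded onto `config.device` and the policy is moved there before `eval()`. A usage sketch, assuming `from_pretrained` still accepts an optional `config` keyword (the checkpoint path is reused from the Makefile targets above):

```python
from lerobot.common.policies.act.modeling_act import ACTPolicy
from lerobot.configs.policies import PreTrainedConfig

ckpt = "tests/outputs/act/checkpoints/000004/pretrained_model"

# Override the device stored in the checkpoint's config before loading, if desired.
cfg = PreTrainedConfig.from_pretrained(ckpt)
cfg.device = "cpu"

# There is no separate map_location argument anymore; cfg.device decides placement.
policy = ACTPolicy.from_pretrained(ckpt, config=cfg)
```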
29 changes: 0 additions & 29 deletions lerobot/common/robot_devices/control_configs.py
@@ -12,17 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from dataclasses import dataclass
from pathlib import Path

import draccus

from lerobot.common.robot_devices.robots.configs import RobotConfig
from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available, is_torch_device_available
from lerobot.configs import parser
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.train import TrainPipelineConfig


@dataclass
@@ -57,11 +54,6 @@ class RecordControlConfig(ControlConfig):
# Root directory where the dataset will be stored (e.g. 'dataset/path').
root: str | Path | None = None
policy: PreTrainedConfig | None = None
# TODO(rcadene, aliberts): By default, use device and use_amp values from policy checkpoint.
device: str | None = None # cuda | cpu | mps
# `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
# automatic gradient scaling is used.
use_amp: bool | None = None
# Limit the frames per second. By default, uses the policy fps.
fps: int | None = None
# Number of seconds before starting data collection. It allows the robot devices to warmup and synchronize.
@@ -104,27 +96,6 @@ def __post_init__(self):
self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
self.policy.pretrained_path = policy_path

# When no device or use_amp are given, use the one from training config.
if self.device is None or self.use_amp is None:
train_cfg = TrainPipelineConfig.from_pretrained(policy_path)
if self.device is None:
self.device = train_cfg.device
if self.use_amp is None:
self.use_amp = train_cfg.use_amp

# Automatically switch to available device if necessary
if not is_torch_device_available(self.device):
auto_device = auto_select_torch_device()
logging.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.")
self.device = auto_device

# Automatically deactivate AMP if necessary
if self.use_amp and not is_amp_available(self.device):
logging.warning(
f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP."
)
self.use_amp = False


@ControlConfig.register_subclass("replay")
@dataclass
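Note: the auto-device-selection and AMP-deactivation logic removed here (and the matching block removed from `EvalPipelineConfig` below) is not simply dropped; per the PR title, the `device`/`use_amp` fields move onto `PreTrainedConfig`, and the fallback presumably moves with them. That part of the diff is not visible in this view; the relocated pattern, sketched against a policy config for illustration, looks roughly like:

```python
import logging

from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available, is_torch_device_available

def resolve_device_and_amp(cfg) -> None:
    """Sketch: make cfg.device point at an available device and keep cfg.use_amp consistent with it."""
    if not cfg.device or not is_torch_device_available(cfg.device):
        auto_device = auto_select_torch_device()
        logging.warning(f"Device '{cfg.device}' is not available. Switching to '{auto_device}'.")
        cfg.device = str(auto_device)
    if cfg.use_amp and not is_amp_available(cfg.device):
        logging.warning(f"AMP is not available on device '{cfg.device}'. Deactivating AMP.")
        cfg.use_amp = False
```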
16 changes: 5 additions & 11 deletions lerobot/common/robot_devices/control_utils.py
@@ -32,6 +32,7 @@
from lerobot.common.datasets.image_writer import safe_stop_image_writer
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.utils import get_features_from_robot
from lerobot.common.policies.pretrained import PreTrainedPolicy
from lerobot.common.robot_devices.robots.utils import Robot
from lerobot.common.robot_devices.utils import busy_wait
from lerobot.common.utils.utils import get_safe_torch_device, has_method
@@ -193,8 +194,6 @@ def record_episode(
episode_time_s,
display_cameras,
policy,
device,
use_amp,
fps,
single_task,
):
@@ -205,8 +204,6 @@
dataset=dataset,
events=events,
policy=policy,
device=device,
use_amp=use_amp,
fps=fps,
teleoperate=policy is None,
single_task=single_task,
@@ -221,9 +218,7 @@ def control_loop(
display_cameras=False,
dataset: LeRobotDataset | None = None,
events=None,
policy=None,
device: torch.device | str | None = None,
use_amp: bool | None = None,
policy: PreTrainedPolicy = None,
fps: int | None = None,
single_task: str | None = None,
):
@@ -246,9 +241,6 @@
if dataset is not None and fps is not None and dataset.fps != fps:
raise ValueError(f"The dataset fps should be equal to requested fps ({dataset['fps']} != {fps}).")

if isinstance(device, str):
device = get_safe_torch_device(device)

timestamp = 0
start_episode_t = time.perf_counter()
while timestamp < control_time_s:
@@ -260,7 +252,9 @@
observation = robot.capture_observation()

if policy is not None:
pred_action = predict_action(observation, policy, device, use_amp)
pred_action = predict_action(
observation, policy, get_safe_torch_device(policy.config.device), policy.config.use_amp
)
# Action can eventually be clipped using `max_relative_target`,
# so action actually sent is saved in the dataset.
action = robot.send_action(pred_action)
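The loop now pulls both the device and the AMP flag off the policy config instead of taking them as parameters. The same pattern applies to any caller running a policy outside `control_loop`; a small wrapper sketch, assuming `predict_action` is importable from this module as its use here suggests:

```python
import torch

from lerobot.common.robot_devices.control_utils import predict_action
from lerobot.common.utils.utils import get_safe_torch_device

def act_once(policy, observation: dict[str, torch.Tensor]) -> torch.Tensor:
    """Run one inference step, reading device and AMP settings from the policy config."""
    device = get_safe_torch_device(policy.config.device)
    return predict_action(observation, policy, device, policy.config.use_amp)
```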
5 changes: 4 additions & 1 deletion lerobot/common/utils/utils.py
@@ -51,8 +51,10 @@ def auto_select_torch_device() -> torch.device:
return torch.device("cpu")


# TODO(Steven): Remove log. log shouldn't be an argument, this should be handled by the logger level
def get_safe_torch_device(try_device: str, log: bool = False) -> torch.device:
"""Given a string, return a torch.device with checks on whether the device is available."""
try_device = str(try_device)
match try_device:
case "cuda":
assert torch.cuda.is_available()
@@ -85,14 +87,15 @@ def get_safe_dtype(dtype: torch.dtype, device: str | torch.device):


def is_torch_device_available(try_device: str) -> bool:
try_device = str(try_device) # Ensure try_device is a string
if try_device == "cuda":
return torch.cuda.is_available()
elif try_device == "mps":
return torch.backends.mps.is_available()
elif try_device == "cpu":
return True
else:
raise ValueError(f"Unknown device '{try_device}.")
raise ValueError(f"Unknown device {try_device}. Supported devices are: cuda, mps or cpu.")


def is_amp_available(device: str):
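Both helpers now coerce their argument to `str`, so a `torch.device` works the same as a plain string:

```python
import torch

from lerobot.common.utils.utils import get_safe_torch_device, is_torch_device_available

# Strings and torch.device objects are now interchangeable for these helpers.
assert is_torch_device_available("cpu")
assert is_torch_device_available(torch.device("cpu"))

device = get_safe_torch_device(torch.device("cpu"))
assert device == torch.device("cpu")
```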
33 changes: 0 additions & 33 deletions lerobot/configs/eval.py
@@ -18,11 +18,9 @@
from pathlib import Path

from lerobot.common import envs, policies # noqa: F401
from lerobot.common.utils.utils import auto_select_torch_device, is_amp_available, is_torch_device_available
from lerobot.configs import parser
from lerobot.configs.default import EvalConfig
from lerobot.configs.policies import PreTrainedConfig
from lerobot.configs.train import TrainPipelineConfig


@dataclass
@@ -35,11 +33,6 @@ class EvalPipelineConfig:
policy: PreTrainedConfig | None = None
output_dir: Path | None = None
job_name: str | None = None
# TODO(rcadene, aliberts): By default, use device and use_amp values from policy checkpoint.
device: str | None = None # cuda | cpu | mps
# `use_amp` determines whether to use Automatic Mixed Precision (AMP) for training and evaluation. With AMP,
# automatic gradient scaling is used.
use_amp: bool = False
seed: int | None = 1000

def __post_init__(self):
@@ -50,27 +43,6 @@ def __post_init__(self):
self.policy = PreTrainedConfig.from_pretrained(policy_path, cli_overrides=cli_overrides)
self.policy.pretrained_path = policy_path

# When no device or use_amp are given, use the one from training config.
if self.device is None or self.use_amp is None:
train_cfg = TrainPipelineConfig.from_pretrained(policy_path)
if self.device is None:
self.device = train_cfg.device
if self.use_amp is None:
self.use_amp = train_cfg.use_amp

# Automatically switch to available device if necessary
if not is_torch_device_available(self.device):
auto_device = auto_select_torch_device()
logging.warning(f"Device '{self.device}' is not available. Switching to '{auto_device}'.")
self.device = auto_device

# Automatically deactivate AMP if necessary
if self.use_amp and not is_amp_available(self.device):
logging.warning(
f"Automatic Mixed Precision (amp) is not available on device '{self.device}'. Deactivating AMP."
)
self.use_amp = False

else:
logging.warning(
"No pretrained path was provided, evaluated policy will be built from scratch (random weights)."
@@ -87,11 +59,6 @@ def __post_init__(self):
eval_dir = f"{now:%Y-%m-%d}/{now:%H-%M-%S}_{self.job_name}"
self.output_dir = Path("outputs/eval") / eval_dir

if self.device is None:
raise ValueError("Set one of the following device: cuda, cpu or mps")
elif self.device == "cuda" and self.use_amp is None:
raise ValueError("Set 'use_amp' to True or False.")

@classmethod
def __get_path_fields__(cls) -> list[str]:
"""This enables the parser to load config from the policy using `--policy.path=local/dir`"""
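With `device` and `use_amp` gone from `EvalPipelineConfig`, the eval entry point is expected to read both from the policy config. A rough sketch of that consumption side (not shown in this view; `cfg.env` and the overall shape are assumptions based on the rest of the diff):

```python
from lerobot.common.policies.factory import make_policy
from lerobot.common.utils.utils import get_safe_torch_device

def prepare_eval(cfg):  # cfg: EvalPipelineConfig
    # Device and AMP now come from the (possibly checkpoint-loaded) policy config.
    device = get_safe_torch_device(cfg.policy.device, log=True)
    policy = make_policy(cfg.policy, env_cfg=cfg.env)
    policy.eval()
    return policy, device, cfg.policy.use_amp
```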