Bump transformers to 4.29.2 (#389)
* bump transformers version to 4.28.1

* update check min version to 4.28

* Update quality tooling to use ruff

* Update examples scripts and diff files

* Fix imports

* Some trivial model patching

* Move encoder output buffer copying into _call_generate

* Some upstream whisper updates

* Update token classification pipeline

* Style

* Fix check_min_version imports

* Add should_log property to training args

* Style

* Some trivial pipeline test updates

* Bump to 4.29.2

* Update examples and diff files

* Decouple MT5 modules from T5 modules as upstream

* Style

* Bump outdated min optimum gc versions for examples
katalinic-gc authored Jun 2, 2023
1 parent 97c11c3 commit 83c8eef
Showing 95 changed files with 1,258 additions and 965 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_pr_documentation.yml
@@ -39,7 +39,7 @@ jobs:
           cd doc-builder
           git pull origin main
           pip install .
-          pip install black
+          pip install .[quality]
           cd ..
       - name: Make documentation
9 changes: 4 additions & 5 deletions .github/workflows/check_code_quality.yml
@@ -42,13 +42,12 @@ jobs:
       run: |
         source venv/bin/activate
         pip install --upgrade pip
-        pip install isort
-        pip install black
+        pip install .[quality]
     - name: Check style with black
       run: |
         source venv/bin/activate
-        black --check .
-    - name: Check style with isort
+        black --check examples tests optimum
+    - name: Check style with ruff
       run: |
         source venv/bin/activate
-        isort --check .
+        ruff examples tests optimum
10 changes: 6 additions & 4 deletions Makefile
@@ -22,14 +22,16 @@ REAL_CLONE_NAME = $(if $(CLONE_NAME),$(CLONE_NAME),$(DEFAULT_CLONE_NAME))
 
 .PHONY: style test
 
+check_dirs := examples tests optimum
+
 # Run code quality checks
 style_check:
-	black --check .
-	isort --check .
+	black --check $(check_dirs)
+	ruff $(check_dirs)
 
 style:
-	black .
-	isort .
+	black $(check_dirs)
+	ruff $(check_dirs) --fix
 
 # Run tests for the library
 test:
18 changes: 11 additions & 7 deletions examples/audio-classification/run_audio_classification.py
@@ -23,23 +23,23 @@
 from typing import Optional
 
 import datasets
-import numpy as np
-from datasets import DatasetDict, load_dataset
-
 import evaluate
+import numpy as np
 import transformers
-from optimum.graphcore import IPUConfig, IPUTrainer
-from optimum.graphcore import IPUTrainingArguments as TrainingArguments
+from datasets import DatasetDict, load_dataset
 from transformers import AutoConfig, AutoFeatureExtractor, AutoModelForAudioClassification, HfArgumentParser, set_seed
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
+from optimum.graphcore import IPUConfig, IPUTrainer
+from optimum.graphcore import IPUTrainingArguments as TrainingArguments
+
 
 logger = logging.getLogger(__name__)
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.25.0")
+check_min_version("4.29.0")
 
 require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
 
@@ -203,6 +203,10 @@ def main():
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 
+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+
     log_level = training_args.get_process_log_level()
     logger.setLevel(log_level)
     transformers.utils.logging.set_verbosity(log_level)
@@ -278,7 +282,7 @@ def main():
     # Prepare label mappings.
     # We'll include these in the model's config to get human readable labels in the Inference API.
     labels = raw_datasets["train"].features[data_args.label_column_name].names
-    label2id, id2label = dict(), dict()
+    label2id, id2label = {}, {}
     for i, label in enumerate(labels):
         label2id[label] = str(i)
         id2label[str(i)] = label
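The `should_log` hunk above is the same logging change this commit applies to each example script. For reference, a minimal consolidated sketch of the resulting setup; the helper name `configure_example_logging` is illustrative and not part of the commit:

import logging
import sys

import transformers


def configure_example_logging(training_args, logger):
    # Mirrors the logging setup used by the updated example scripts (sketch).
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if training_args.should_log:
        # training_args.log_level defaults to "passive", so force info-level
        # verbosity on processes that should log.
        transformers.utils.logging.set_verbosity_info()
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)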
43 changes: 23 additions & 20 deletions examples/image-classification/run_image_classification.py
@@ -19,8 +19,10 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
+import evaluate
 import numpy as np
 import torch
+import transformers
 from datasets import load_dataset
 from PIL import Image
 from torchvision.transforms import (
@@ -32,35 +34,32 @@
     Resize,
     ToTensor,
 )
-
-import evaluate
-import transformers
-from optimum.graphcore import IPUConfig, IPUTrainer
-from optimum.graphcore import IPUTrainingArguments as TrainingArguments
-from optimum.graphcore.utils import check_min_version
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
     AutoConfig,
-    AutoFeatureExtractor,
+    AutoImageProcessor,
     AutoModelForImageClassification,
     HfArgumentParser,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version as tf_check_min_version
-from transformers.utils import send_example_telemetry
+from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
+from optimum.graphcore import IPUConfig, IPUTrainer
+from optimum.graphcore import IPUTrainingArguments as TrainingArguments
+from optimum.graphcore.utils import check_min_version as gc_check_min_version
+
 
 """ Fine-tuning a 🤗 Transformers model for image classification"""
 
 logger = logging.getLogger(__name__)
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-tf_check_min_version("4.25.0")
+check_min_version("4.29.0")
 
 # Will error if the minimal version of Optimum Graphcore is not installed. Remove at your own risks.
-check_min_version("0.2.4.dev0")
+gc_check_min_version("0.6.0.dev0")
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
 
@@ -146,7 +145,7 @@ class ModelArguments:
         default="main",
         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -212,6 +211,10 @@ def main():
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 
+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+
     log_level = training_args.get_process_log_level()
     logger.setLevel(log_level)
     transformers.utils.logging.set_verbosity(log_level)
@@ -270,7 +273,7 @@ def main():
     # Prepare label mappings.
     # We'll include these in the model's config to get human readable labels in the Inference API.
     labels = dataset["train"].features["labels"].names
-    label2id, id2label = dict(), dict()
+    label2id, id2label = {}, {}
     for i, label in enumerate(labels):
         label2id[label] = str(i)
         id2label[str(i)] = label
@@ -309,19 +312,19 @@ def compute_metrics(p):
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        model_args.feature_extractor_name or model_args.model_name_or_path,
+    image_processor = AutoImageProcessor.from_pretrained(
+        model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
 
     # Define torchvision transforms to be applied to each image.
-    if "shortest_edge" in feature_extractor.size:
-        size = feature_extractor.size["shortest_edge"]
+    if "shortest_edge" in image_processor.size:
+        size = image_processor.size["shortest_edge"]
     else:
-        size = (feature_extractor.size["height"], feature_extractor.size["width"])
-    normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        size = (image_processor.size["height"], image_processor.size["width"])
+    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
     _train_transforms = [
         RandomResizedCrop(size),
         RandomHorizontalFlip(),
@@ -370,7 +373,7 @@ def compute_metrics(p):
         train_dataset=dataset["train"] if training_args.do_train else None,
         eval_dataset=dataset["validation"] if training_args.do_eval else None,
         compute_metrics=compute_metrics,
-        tokenizer=feature_extractor,
+        tokenizer=image_processor,
         data_collator=collate_fn,
     )
 
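The image-classification diff above swaps AutoFeatureExtractor for AutoImageProcessor. A condensed, runnable sketch of the new transform setup; the checkpoint name google/vit-base-patch16-224-in21k is illustrative only:

from torchvision.transforms import Compose, Normalize, RandomHorizontalFlip, RandomResizedCrop, ToTensor
from transformers import AutoImageProcessor

# Illustrative checkpoint; any image-classification checkpoint with an image processor config works.
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")

# Image processors expose either a single "shortest_edge" key or explicit
# "height"/"width" keys in their `size` dict, hence the two branches.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
train_transforms = Compose([RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize])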
1 change: 0 additions & 1 deletion examples/language-modeling/prepare_dataset.py
@@ -21,7 +21,6 @@
 from typing import Optional, Sequence, Union
 
 from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
-
 from joblib import Parallel, delayed
 
 
56 changes: 43 additions & 13 deletions examples/language-modeling/run_clm.py
@@ -30,12 +30,9 @@
 from typing import Optional
 
 import datasets
-from datasets import load_dataset
-
+import torch
 import transformers
-from optimum.graphcore import IPUConfig, IPUTrainer
-from optimum.graphcore import IPUTrainingArguments as TrainingArguments
-from optimum.graphcore.data import pad_on_batch_axis
+from datasets import load_dataset
 from transformers import (
     CONFIG_MAPPING,
     MODEL_FOR_CAUSAL_LM_MAPPING,
@@ -48,13 +45,15 @@
 )
 from transformers.testing_utils import CaptureLogger
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version as tf_check_min_version
-from transformers.utils import send_example_telemetry
+from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
+from optimum.graphcore import IPUConfig, IPUTrainer
+from optimum.graphcore import IPUTrainingArguments as TrainingArguments
+
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-tf_check_min_version("4.25.0")
+check_min_version("4.29.0")
 
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 
@@ -119,6 +118,25 @@ class ModelArguments:
             )
         },
     )
+    torch_dtype: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
+                "dtype will be automatically derived from the model's weights."
+            ),
+            "choices": ["auto", "float16", "float32"],
+        },
+    )
+    low_cpu_mem_usage: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded."
+                "set True will benefit LLM loading time and RAM consumption."
+            )
+        },
+    )
 
 
 @dataclass
@@ -220,6 +238,10 @@ def main():
         handlers=[logging.StreamHandler(sys.stdout)],
     )
 
+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+
     log_level = training_args.get_process_log_level()
     logger.setLevel(log_level)
     datasets.utils.logging.set_verbosity(log_level)
@@ -373,17 +395,24 @@ def main():
     )
 
     if model_args.model_name_or_path:
+        torch_dtype = (
+            model_args.torch_dtype
+            if model_args.torch_dtype in ["auto", None]
+            else getattr(torch, model_args.torch_dtype)
+        )
         model = AutoModelForCausalLM.from_pretrained(
             model_args.model_name_or_path,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
             cache_dir=model_args.cache_dir,
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,
+            torch_dtype=torch_dtype,
+            low_cpu_mem_usage=model_args.low_cpu_mem_usage,
         )
     else:
         model = AutoModelForCausalLM.from_config(config)
-        n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+        n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
         logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
 
     # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
@@ -395,9 +424,9 @@
     # Preprocessing the datasets.
     # First we tokenize all the texts.
     if training_args.do_train:
-        column_names = raw_datasets["train"].column_names
+        column_names = list(raw_datasets["train"].features)
     else:
-        column_names = raw_datasets["validation"].column_names
+        column_names = list(raw_datasets["validation"].features)
     text_column_name = "text" if "text" in column_names else column_names[0]
 
     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
@@ -428,8 +457,9 @@ def tokenize_function(examples):
         block_size = tokenizer.model_max_length
         if block_size > 1024:
             logger.warning(
-                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
+                "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
+                " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
+                " override this default with `--block_size xxx`."
             )
             block_size = 1024
     else:
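The run_clm.py diff above adds `torch_dtype` and `low_cpu_mem_usage` model arguments. A minimal sketch of how the dtype string is resolved before model loading; the `requested_dtype` variable and the gpt2 checkpoint are illustrative:

import torch
from transformers import AutoModelForCausalLM

requested_dtype = "float16"  # accepted values per the new argument: "auto", "float16", "float32", or None

# "auto"/None are passed through unchanged; any other value is mapped to the matching torch dtype.
torch_dtype = requested_dtype if requested_dtype in ["auto", None] else getattr(torch, requested_dtype)

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",  # illustrative checkpoint
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,  # defers weight materialization to reduce peak RAM; needs `accelerate` installed
)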