bug_fix (#3184)
lugimzzz authored Sep 5, 2022
1 parent 47a2ea5 commit ab2bd21
Showing 17 changed files with 231 additions and 227 deletions.
16 changes: 12 additions & 4 deletions applications/text_classification/hierarchical/README.md
@@ -47,6 +47,7 @@
wget https://paddlenlp.bj.bcebos.com/datasets/baidu_extract_2020.tar.gz
tar -zxvf baidu_extract_2020.tar.gz
mv baidu_extract_2020 data
rm baidu_extract_2020.tar.gz
```

<div align="center">
@@ -194,6 +195,7 @@ data/
Training runs on CPU or GPU; GPU is the default. To train on CPU, simply change the device argument to `--device "cpu"`:
```shell
python train.py \
--dataset_dir "data" \
--device "gpu" \
--max_seq_length 128 \
--model_name "ernie-3.0-medium-zh" \
@@ -205,6 +207,7 @@ python train.py \
When training in a CPU environment, you can set the `nproc_per_node` argument for multi-core training:
```shell
python -m paddle.distributed.launch --nproc_per_node 8 --backend "gloo" train.py \
--dataset_dir "data" \
--device "gpu" \
--max_seq_length 128 \
--model_name "ernie-3.0-medium-zh" \
@@ -217,6 +220,7 @@ python -m paddle.distributed.launch --nproc_per_node 8 --backend "gloo" train.py
```shell
unset CUDA_VISIBLE_DEVICES
python -m paddle.distributed.launch --gpus "0" train.py \
--dataset_dir "data" \
--device "gpu" \
--max_seq_length 128 \
--model_name "ernie-3.0-medium-zh" \
@@ -260,13 +264,13 @@ checkpoint/
**NOTE:**
* To resume model training, set `--init_from_ckpt checkpoint/model_state.pdparams` (a rough sketch of what this restores is shown after this list).
* To train an English text classification task, simply switch the pretrained model via the `model_name` argument. "ernie-2.0-base-en" is recommended for English tasks; see [Transformer pretrained models](https://paddlenlp.readthedocs.io/zh/latest/model_zoo/index.html#transformer) for more options.

* For text classification in languages other than Chinese and English, the multilingual pretrained models "ernie-m-base" and "ernie-m-large" are recommended. Multilingual models do not yet support text classification deployment; this capability is under active development.
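What `--init_from_ckpt` restores is, roughly, the saved model weights. A hypothetical Python sketch (not an excerpt from train.py); the model name and `num_classes` value here are assumptions:

```python
import paddle
from paddlenlp.transformers import AutoModelForSequenceClassification

num_classes = 10  # assumed; in practice this is the number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    "ernie-3.0-medium-zh", num_classes=num_classes)
# Load the state dict written by a previous run and restore it.
state_dict = paddle.load("checkpoint/model_state.pdparams")
model.set_state_dict(state_dict)
```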
#### 2.4.2 Training Evaluation and Model Optimization

The trained model can be evaluated per category with the [model analysis module](./analysis), which also outputs mispredicted samples (bad cases). GPU is used by default; in a CPU environment change the argument to `--device "cpu"`:

```shell
python analysis/evaluate.py --device "gpu" --max_seq_length 128 --batch_size 32 --bad_case_path "./bad_case.txt"
python analysis/evaluate.py --device "gpu" --max_seq_length 128 --batch_size 32 --bad_case_path "./bad_case.txt" --dataset_dir "data" --params_path "./checkpoint"
```

Example output:
@@ -307,7 +311,7 @@ Prediction Label Text
After training finishes, supply the data to predict (data.txt) and the label reference list (label.txt) and run prediction with the trained model. GPU is used by default; in a CPU environment change the argument to `--device "cpu"`:

```shell
python predict.py --device "gpu" --max_seq_length 128 --batch_size 32
python predict.py --device "gpu" --max_seq_length 128 --batch_size 32 --dataset_dir "data"
```

Configurable arguments:
@@ -361,10 +365,14 @@ pip install paddleslim==2.2.2
```shell
python prune.py \
--device "gpu" \
--dataset_dir "data" \
--output_dir "prune" \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--num_train_epochs 10 \
--max_seq_length 128 \
--logging_steps 5 \
--save_steps 100 \
--width_mult_list '3/4' '2/3' '1/2'
```

@@ -376,7 +384,7 @@ python prune.py \
* `per_device_eval_batch_size`: Batch size for evaluation on the dev set. Adjust it to fit GPU memory; if you run out of memory, lower this value. Defaults to 32.
* `learning_rate`: Maximum learning rate for training. Defaults to 3e-5.
* `num_train_epochs`: Number of training epochs; 100 is a reasonable choice when early stopping is used. Defaults to 10.
* `logging_steps`: Interval, in steps, between log prints during training. Defaults to 5.
* `logging_steps`: Interval, in steps, between log prints during training. Defaults to 100.
* `save_steps`: Interval, in steps, between saving model checkpoints during training. Defaults to 100.
* `seed`: Random seed. Defaults to 3.
* `width_mult_list`: List of retained width ratios for pruning (multi-head width), i.e. the fraction of the `q`, `k`, `v` and `ffn` weight widths kept in self_attention; each ratio multiplied by the width (the number of attention heads) should be an integer (see the sketch below). Defaults to None.
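The integer constraint on `width_mult_list` is easy to check. A minimal sketch, assuming a backbone with 12 attention heads (the head count of ernie-3.0-medium-zh); adjust `num_heads` for other models:

```python
from fractions import Fraction

num_heads = 12  # assumed head count; check your model's config
for ratio in ["3/4", "2/3", "1/2"]:
    kept = Fraction(ratio) * num_heads
    ok = kept.denominator == 1
    print(f"width_mult {ratio}: keeps {kept} of {num_heads} heads "
          f"({'valid' if ok else 'invalid: not an integer'})")
```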
@@ -142,9 +142,8 @@ def evaluate():
probs = []
labels = []
for batch in train_data_loader:
input_ids, token_type_ids, label = batch['input_ids'], batch[
'token_type_ids'], batch['labels']
logits = model(input_ids, token_type_ids)
label = batch.pop("labels")
logits = model(**batch)
labels.extend(label.numpy())
probs.extend(F.sigmoid(logits).numpy())
probs = np.array(probs)
@@ -158,9 +157,8 @@
probs = []
labels = []
for batch in dev_data_loader:
input_ids, token_type_ids, label = batch['input_ids'], batch[
'token_type_ids'], batch['labels']
logits = model(input_ids, token_type_ids)
label = batch.pop("labels")
logits = model(**batch)
labels.extend(label.numpy())
probs.extend(F.sigmoid(logits).numpy())
probs = np.array(probs)
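# A minimal sketch of the pattern used above (names such as collect_probs are
# illustrative, not part of this file): DataCollatorWithPadding yields each
# batch as a dict, so the labels are popped off and the remaining tensors
# (input_ids, token_type_ids, ...) are unpacked straight into the model.
import numpy as np
import paddle
import paddle.nn.functional as F

def collect_probs(model, data_loader):
    model.eval()
    probs, labels = [], []
    with paddle.no_grad():
        for batch in data_loader:
            label = batch.pop("labels")  # remove the target from the input dict
            logits = model(**batch)      # forward the rest as keyword arguments
            labels.extend(label.numpy())
            probs.extend(F.sigmoid(logits).numpy())
    return np.array(probs), np.array(labels)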
87 changes: 39 additions & 48 deletions applications/text_classification/hierarchical/predict.py
@@ -14,15 +14,19 @@

import os
import argparse

import functools
import numpy as np

import paddle
import paddle.nn.functional as F
from paddlenlp.utils.log import logger
from paddlenlp.data import Tuple, Pad
from paddle.io import DataLoader, BatchSampler
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

from utils import preprocess_function, read_local_dataset

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--device', default="gpu", help="Select which device to train model, defaults to gpu.")
@@ -37,42 +41,47 @@


@paddle.no_grad()
def predict(data, label_list):
def predict():
"""
Predicts the data labels.
Args:
data (obj:`List`): The processed data, each element of which is one sequence.
label_map(obj:`List`): The label id (key) to label str (value) map.
Predicts the data labels.
"""
paddle.set_device(args.device)
model = AutoModelForSequenceClassification.from_pretrained(args.params_path)
tokenizer = AutoTokenizer.from_pretrained(args.params_path)

examples = []
for text in data:
result = tokenizer(text=text, max_seq_len=args.max_seq_length)
examples.append((result['input_ids'], result['token_type_ids']))
label_list = []
label_path = os.path.join(args.dataset_dir, args.label_file)
with open(label_path, 'r', encoding='utf-8') as f:
for i, line in enumerate(f):
label_list.append(line.strip())

data_ds = load_dataset(read_local_dataset,
path=os.path.join(args.dataset_dir, args.data_file),
is_test=True,
lazy=False)

trans_func = functools.partial(preprocess_function,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
label_nums=len(label_list),
is_test=True)

# Separates data into batches.
batches = [
examples[i:i + args.batch_size]
for i in range(0, len(examples), args.batch_size)
]
data_ds = data_ds.map(trans_func)

batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
): fn(samples)
# batchify dataset
collate_fn = DataCollatorWithPadding(tokenizer)
data_batch_sampler = BatchSampler(data_ds,
batch_size=args.batch_size,
shuffle=False)

data_data_loader = DataLoader(dataset=data_ds,
batch_sampler=data_batch_sampler,
collate_fn=collate_fn)

results = []
model.eval()
for batch in batches:
input_ids, token_type_ids = batchify_fn(batch)
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
logits = model(input_ids, token_type_ids)
for batch in data_data_loader:
logits = model(**batch)
probs = F.sigmoid(logits).numpy()
for prob in probs:
labels = []
@@ -81,9 +90,9 @@ def predict(data, label_list):
labels.append(label_list[i])
results.append(labels)

for text, labels in zip(data, results):
for t, labels in zip(data_ds.data, results):
hierarchical_labels = {}
logger.info("text: {}".format(text))
logger.info("text: {}".format(t["sentence"]))
logger.info("prediction result: {}".format(",".join(labels)))
for label in labels:
for i, l in enumerate(label.split('##')):
@@ -100,22 +109,4 @@ def predict(data, label_list):

if __name__ == "__main__":

data_dir = os.path.join(args.dataset_dir, args.data_file)
label_dir = os.path.join(args.dataset_dir, args.label_file)

data = []
label_list = []

with open(data_dir, 'r', encoding='utf-8') as f:
lines = f.readlines()
for i, line in enumerate(lines):
data.append(line.strip())
f.close()

with open(label_dir, 'r', encoding='utf-8') as f:
lines = f.readlines()
for i, line in enumerate(lines):
label_list.append(line.strip())
f.close()

predict(data, label_list)
predict()
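# A self-contained sketch of the post-processing above: keep every label whose
# sigmoid probability exceeds a threshold (0.5 assumed here), then recover the
# hierarchy levels by splitting on "##". The probabilities and label names
# below are made up for illustration.
import numpy as np

label_list = ["news", "news##sports", "finance"]
probs = np.array([0.91, 0.73, 0.12])

# Multi-label decision: keep every label above the threshold.
predicted = [label_list[i] for i, p in enumerate(probs) if p > 0.5]

# Rebuild the hierarchy level by level.
hierarchical = {}
for label in predicted:
    for depth, part in enumerate(label.split("##")):
        hierarchical.setdefault(depth, set()).add(part)

print(predicted)     # ['news', 'news##sports']
print(hierarchical)  # {0: {'news'}, 1: {'sports'}}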
8 changes: 3 additions & 5 deletions applications/text_classification/hierarchical/train.py
@@ -40,7 +40,7 @@
parser.add_argument("--save_dir", default="./checkpoint", type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument('--model_name', default="ernie-3.0-medium-zh", help="Select model to train, defaults to ernie-3.0-medium-zh.",
choices=["ernie-3.0-xbase-zh", "ernie-3.0-base-zh", "ernie-3.0-medium-zh", "ernie-3.0-micro-zh", "ernie-3.0-mini-zh", "ernie-3.0-nano-zh", "ernie-2.0-base-en", "ernie-2.0-large-en"])
choices=["ernie-3.0-xbase-zh", "ernie-3.0-base-zh", "ernie-3.0-medium-zh", "ernie-3.0-micro-zh", "ernie-3.0-mini-zh", "ernie-3.0-nano-zh", "ernie-2.0-base-en", "ernie-2.0-large-en","ernie-m-base","ernie-m-large"])
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--epochs", default=100, type=int, help="Total number of training epochs to perform.")
@@ -178,10 +178,8 @@ def train():

for step, batch in enumerate(train_data_loader, start=1):

input_ids, token_type_ids, labels = batch['input_ids'], batch[
'token_type_ids'], batch['labels']

logits = model(input_ids, token_type_ids)
labels = batch.pop("labels")
logits = model(**batch)
loss = criterion(logits, labels)

probs = F.sigmoid(logits)
37 changes: 23 additions & 14 deletions applications/text_classification/hierarchical/utils.py
@@ -34,9 +34,8 @@ def evaluate(model, criterion, metric, data_loader):
metric.reset()
losses = []
for batch in data_loader:
input_ids, token_type_ids, labels = batch['input_ids'], batch[
'token_type_ids'], batch['labels']
logits = model(input_ids, token_type_ids)
labels = batch.pop("labels")
logits = model(**batch)
loss = criterion(logits, labels)
probs = F.sigmoid(logits)
losses.append(loss.numpy())
@@ -51,7 +50,11 @@
return micro_f1_score, macro_f1_score


def preprocess_function(examples, tokenizer, max_seq_length, label_nums):
def preprocess_function(examples,
tokenizer,
max_seq_length,
label_nums,
is_test=False):
"""
Builds model inputs from a sequence for sequence classification tasks
by concatenating and adding special tokens.
@@ -68,21 +71,27 @@ def preprocess_function(examples, tokenizer, max_seq_length, label_nums):
"""
result = tokenizer(text=examples["sentence"], max_seq_len=max_seq_length)
# One-Hot label
result["labels"] = [
float(1) if i in examples["label"] else float(0)
for i in range(label_nums)
]
if not is_test:
result["labels"] = [
float(1) if i in examples["label"] else float(0)
for i in range(label_nums)
]
return result
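# A toy example of the one-hot construction above, which the new is_test
# switch skips at inference time: with label_nums = 5 and a sample tagged
# with label ids 1 and 3, the encoded target is a float vector of length 5.
label_nums = 5
example_label = [1, 3]
one_hot = [float(1) if i in example_label else float(0) for i in range(label_nums)]
print(one_hot)  # [0.0, 1.0, 0.0, 1.0, 0.0]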


def read_local_dataset(path, label_list):
def read_local_dataset(path, label_list=None, is_test=False):
"""
Read dataset
"""
with open(path, 'r', encoding='utf-8') as f:
for line in f:
items = line.strip().split('\t')
sentence = ''.join(items[:-1])
label = items[-1]
labels = [label_list[l] for l in label.split(',')]
yield {'sentence': sentence, 'label': labels}
if is_test:
items = line.strip().split('\t')
sentence = ''.join(items)
yield {'sentence': sentence}
else:
items = line.strip().split('\t')
sentence = ''.join(items[:-1])
label = items[-1]
labels = [label_list[l] for l in label.split(',')]
yield {'sentence': sentence, 'label': labels}
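# A worked example of the input format read_local_dataset expects, assuming
# tab-separated lines: training files end with a comma-separated label field,
# test files contain only the text. The sentence, label names and label_list
# mapping below are made up for illustration.
sample_train_line = "It was a slow, boring match.\tsports,sports##football\n"
label_list = {"sports": 0, "sports##football": 1, "finance": 2}

items = sample_train_line.strip().split("\t")
sentence = "".join(items[:-1])
labels = [label_list[l] for l in items[-1].split(",")]
print({"sentence": sentence, "label": labels})
# {'sentence': 'It was a slow, boring match.', 'label': [0, 1]}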