-
Notifications
You must be signed in to change notification settings - Fork 96
/
Copy pathcoyo_1m_dataset_preprocess.py
131 lines (109 loc) · 3.9 KB
/
coyo_1m_dataset_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import argparse
import logging
import random
import cv2
import jsonlines
import numpy as np
import requests
from datasets import load_dataset
from PIL import Image
# Module-level logger; the script uses it to report images that fail to
# download or process.
logger = logging.getLogger(__name__)
def parse_args():
    """Parse the script's command-line options.

    Returns:
        argparse.Namespace with:
            train_data_dir (str, required): directory to store the dataset.
            cache_dir (str, required): directory to store cache.
            max_train_samples (int or None, default None): number of examples
                in the dataset.
            num_proc (int, default 1): number of processors to use in
                `dataset.map()`.
    """
    cli = argparse.ArgumentParser(
        description="Example of a data preprocessing script."
    )

    # One (flag, keyword-options) entry per command-line option.
    option_table = (
        (
            "--train_data_dir",
            {
                "type": str,
                "required": True,
                "help": "The directory to store the dataset",
            },
        ),
        (
            "--cache_dir",
            {
                "type": str,
                "required": True,
                "help": "The directory to store cache",
            },
        ),
        (
            "--max_train_samples",
            {
                "type": int,
                "default": None,
                "help": "number of examples in the dataset",
            },
        ),
        (
            "--num_proc",
            {
                "type": int,
                "default": 1,
                "help": "number of processors to use in `dataset.map()`",
            },
        ),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    return cli.parse_args()
# quality filter applied to the first `max_train_samples` examples
def filter_function(example):
    """Decide whether a dataset row passes the quality thresholds.

    Args:
        example: mapping with the keys "clip_similarity_vitb32",
            "watermark_score" and "aesthetic_score_laion_v2".

    Returns:
        False if the CLIP similarity is below 0.3, the watermark score is
        above 0.4, or the aesthetic score is below 6.0; True otherwise.
    """
    # The checks short-circuit in this order, so later keys are only read
    # when the earlier checks did not already reject the row.
    rejected = (
        example["clip_similarity_vitb32"] < 0.3
        or example["watermark_score"] > 0.4
        or example["aesthetic_score_laion_v2"] < 6.0
    )
    return not rejected
def filter_dataset(dataset, max_train_samples):
    """Take the leading rows of `dataset` and keep those passing the filter.

    Args:
        dataset: object supporting `len()`, `.select(indices)` and, on the
            selected result, `.filter(fn)`.
        max_train_samples: number of leading rows to consider before
            filtering. Values larger than the dataset are clamped to its
            length.

    Returns:
        The selected rows for which `filter_function` returns True.
    """
    # Selecting indices past the end of the dataset raises, so never ask for
    # more rows than actually exist.
    num_candidates = min(max_train_samples, len(dataset))
    candidates = dataset.select(range(num_candidates))
    return candidates.filter(filter_function)
if __name__ == "__main__":
    # Local import: only the script entry point needs it.
    import os

    args = parse_args()

    # Saving an image does not create missing parent folders, so make sure
    # both output folders exist before any worker tries to write into them.
    os.makedirs(f"{args.train_data_dir}/images", exist_ok=True)
    os.makedirs(f"{args.train_data_dir}/processed_images", exist_ok=True)

    # load coyo-700m
    dataset = load_dataset(
        "kakaobrain/coyo-700m",
        cache_dir=args.cache_dir,
        split="train",
    )
    total_rows = len(dataset)
    if total_rows == 0:
        raise SystemExit("The loaded dataset is empty; nothing to preprocess.")

    if args.max_train_samples is None:
        # No limit requested: consider every row of the dataset.
        max_train_samples = total_rows
    else:
        # Estimate the fraction of rows that survive the quality filter from
        # a probe of (at most) the first 20k examples.
        probe_size = min(20000, total_rows)
        filter_ratio = len(filter_dataset(dataset, probe_size)) / probe_size
        if filter_ratio == 0:
            raise SystemExit(
                f"None of the first {probe_size} examples passed the filter; "
                "cannot estimate how many examples to select."
            )

        # Estimate the number of candidate rows needed based on
        # (1) the filter_ratio calculated on the probe
        # (2) the assumption that only 80% of the URLs are still valid
        max_train_samples = int(args.max_train_samples / filter_ratio / 0.8)
        # Never ask for more rows than the dataset contains.
        max_train_samples = min(max_train_samples, total_rows)

    # filter the dataset down to the candidate rows
    small_dataset = filter_dataset(dataset, max_train_samples)

    def preprocess_and_save(example):
        """Download one image, save it with its Canny edge map, log metadata.

        Writes `images/<id>.png`, `processed_images/<id>.png` and appends one
        record to `meta.jsonl` under `args.train_data_dir`. Any failure is
        logged and the example is skipped. Returns None; the function is run
        for its side effects only.
        """
        image_url = example["url"]
        try:
            # download original image; the context manager releases the
            # connection once the image has been decoded
            with requests.get(image_url, stream=True, timeout=5) as response:
                # do not try to decode an HTTP error page as an image
                response.raise_for_status()
                # let urllib3 undo gzip/deflate transfer encoding on the
                # raw stream
                response.raw.decode_content = True
                # normalise to RGB: PNG cannot store some modes (e.g. CMYK)
                # and the edge detector needs 8-bit pixel data
                image = Image.open(response.raw).convert("RGB")

            image_path = f"{args.train_data_dir}/images/{example['id']}.png"
            image.save(image_path)

            # generate and save canny image
            # apply random thresholds
            # note that this should normally be applied on the fly during
            # training. But that's fine when dealing with a larger dataset
            # like here.
            low, high = sorted((random.randint(0, 255), random.randint(0, 255)))
            edges = cv2.Canny(np.array(image), low, high)
            # replicate the single-channel edge map into three channels
            processed_image = Image.fromarray(
                np.concatenate([edges[:, :, None]] * 3, axis=2)
            )
            processed_image_path = (
                f"{args.train_data_dir}/processed_images/{example['id']}.png"
            )
            processed_image.save(processed_image_path)

            # write to meta.jsonl
            meta = {
                "image": image_path,
                "conditioning_image": processed_image_path,
                "caption": example["text"],
            }
            with jsonlines.open(
                f"{args.train_data_dir}/meta.jsonl", "a"
            ) as writer:  # for writing
                writer.write(meta)

        except Exception as e:
            # Best-effort by design: dead links and broken images are
            # expected, so log the failure and move on to the next example.
            logger.error("Failed to process image %s: %s", image_url, e)

    # preprocess -> image, processed image and meta.jsonl
    small_dataset.map(preprocess_and_save, num_proc=args.num_proc)

    print(f"created data folder at: {args.train_data_dir}")