Skip to content

Commit 3407b91

Browse files
authored
Merge pull request #31 from microsoft/download_cmip6
Download and preprocess CMIP6 data
2 parents 5533b8c + 3f97ae7 commit 3407b91

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1049
-2
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,6 @@ dmypy.json
133133

134134
# experiments
135135
exps
136+
137+
# snakemake logs
138+
.snakemake

docs/usage.md

+15-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,21 @@
44

55
### Data Preparation
66

7-
The code for downloading and preprocessing CMIP6 data is coming soon
7+
First install `snakemake` following [these instructions](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html)
8+
9+
To download and regrid a CMIP6 dataset to a common resolution (e.g., 1.40625 degrees), go to the corresponding directory inside `snakemake_configs` and run:
10+
```bash
11+
snakemake all --configfile config_2m_temperature.yml --cores 8
12+
```
13+
This command will download and regrid the `2m_temperature` data in parallel using 8 CPU cores. Change the `--configfile` argument to process other variables. After downloading and regridding, run the following script to preprocess the `.nc` files into `.npz` format for pretraining ClimaX:
14+
```bash
15+
python src/data_preprocessing/nc2np_equally_cmip6.py \
16+
--dataset mpi \
17+
--path /data/CMIP6/MPI-ESM/1.40625deg/ \
18+
--num_shards 10 \
19+
--save_dir /data/CMIP6/MPI-ESM/1.40625deg_np_10shards
20+
```
21+
in which `num_shards` denotes the number of chunks to break each `.nc` file into.
822

923
### Training
1024

snakemake_configs/AWI-ESM/Snakefile

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
2+
year_strings = [f'{y}01010600-{y+1}01010000' for y in range(1850, 2015, 1)]
3+
4+
print(config)
5+
6+
rule download:
7+
output:
8+
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc",
9+
shell:
10+
"wget https://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/AWI/AWI-ESM-1-1-LR/historical/{config["
11+
"run]}/6hrPlevPt/"
12+
"{config[cmip_name]}/gn/v20200212/"
13+
"{config[cmip_name]}_6hrPlevPt_AWI-ESM-1-1-LR_historical_{config[run]}_gn_{wildcards.year_str}.nc "
14+
"-O {wildcards.dataset}/raw/{config[name]}/{config[name]}_{wildcards.year_str}_raw.nc"
15+
16+
rule regrid:
17+
input:
18+
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc"
19+
output:
20+
"{dataset}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc.tmp"
21+
shell:
22+
"python ../../src/data_preprocessing/regrid.py \
23+
--input_fns {input} \
24+
--output_dir {wildcards.dataset}/{wildcards.res}deg/{wildcards.name} \
25+
--ddeg_out {wildcards.res} \
26+
--cmip 1 \
27+
--rename {config[cmip_name]} {config[era_name]} \
28+
--file_ending nc.tmp"
29+
30+
rule delete:
31+
input:
32+
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc.tmp",
33+
res=config['res']),
34+
output:
35+
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc",
36+
res=config['res'])
37+
priority: 100
38+
run:
39+
for i, o in zip(input, output):
40+
shell("mv {i} {o}")
41+
# shell("rm {wildcards.dataset}/raw/{wildcards.name}/{wildcards.name}_{wildcards.year_str}_raw.nc"),
42+
43+
44+
rule all:
45+
input:
46+
expand("{datadir}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc",
47+
datadir=config['datadir'], res=config['res'], name=config['name'], year_str=year_strings)
48+
49+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: 10m_u_component_of_wind
3+
cmip_name: uas
4+
era_name: u10
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: 10m_v_component_of_wind
3+
cmip_name: vas
4+
era_name: v10
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: 2m_temperature
3+
cmip_name: tas
4+
era_name: t2m
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: geopotential
3+
cmip_name: zg
4+
era_name: z
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: specific_humidity
3+
cmip_name: hus
4+
era_name: q
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: temperature
3+
cmip_name: ta
4+
era_name: t
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: u_component_of_wind
3+
cmip_name: ua
4+
era_name: u
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/AWI-ESM
2+
name: v_component_of_wind
3+
cmip_name: va
4+
era_name: v
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625

snakemake_configs/CMCC/Snakefile

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
year_strings = [f'{y}01010600-{y+1}01010000' for y in range(1850, 2015, 1)]
2+
3+
print(config)
4+
5+
rule download:
6+
output:
7+
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc",
8+
shell:
9+
"wget https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/CMCC/CMCC-CM2-HR4/historical/{config["
10+
"run]}/6hrPlevPt/"
11+
"{config[cmip_name]}/gn/v20200904/"
12+
"{config[cmip_name]}_6hrPlevPt_CMCC-CM2-HR4_historical_{config[run]}_gn_{wildcards.year_str}.nc "
13+
"-O {wildcards.dataset}/raw/{config[name]}/{config[name]}_{wildcards.year_str}_raw.nc"
14+
15+
# https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/CMCC/CMCC-CM2-HR4/historical/r1i1p1f1/6hrPlevPt/ta/gn/v20200904/ta_6hrPlevPt_CMCC-CM2-HR4_historical_r1i1p1f1_gn_185001010600-185101010000.nc
16+
17+
rule regrid:
18+
input:
19+
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc"
20+
output:
21+
"{dataset}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc.tmp"
22+
shell:
23+
"python ../../src/data_preprocessing/regrid.py \
24+
--input_fns {input} \
25+
--output_dir {wildcards.dataset}/{wildcards.res}deg/{wildcards.name} \
26+
--ddeg_out {wildcards.res} \
27+
--cmip 1 \
28+
--rename {config[cmip_name]} {config[era_name]} \
29+
--file_ending nc.tmp"
30+
31+
rule delete:
32+
input:
33+
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc.tmp",
34+
res=config['res']),
35+
output:
36+
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc",
37+
res=config['res'])
38+
priority: 100
39+
run:
40+
for i, o in zip(input, output):
41+
shell("mv {i} {o}")
42+
# shell("rm {wildcards.dataset}/raw/{wildcards.name}/{wildcards.name}_{wildcards.year_str}_raw.nc"),
43+
44+
45+
rule all:
46+
input:
47+
expand("{datadir}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc",
48+
datadir=config['datadir'], res=config['res'], name=config['name'], year_str=year_strings)
49+
50+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/CMCC
2+
name: geopotential
3+
cmip_name: zg
4+
era_name: z
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/CMCC
2+
name: temperature
3+
cmip_name: ta
4+
era_name: t
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/CMCC
2+
name: u_component_of_wind
3+
cmip_name: ua
4+
era_name: u
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
datadir: /data/CMIP6/CMCC
2+
name: v_component_of_wind
3+
cmip_name: va
4+
era_name: v
5+
run: r1i1p1f1
6+
res:
7+
- 1.40625
8+
# - 5.625

snakemake_configs/HAMMOZ/Snakefile

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
2+
year_strings = [
3+
'185001010600-187001010000',
4+
'187001010600-189001010000',
5+
'189001010600-191001010000',
6+
'191001010600-193001010000',
7+
'193001010600-195001010000',
8+
'195001010600-197001010000',
9+
'197001010600-199001010000',
10+
'199001010600-201001010000',
11+
'201001010600-201501010000',
12+
]
13+
14+
print(config)
15+
16+
rule download:
17+
output:
18+
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc",
19+
shell:
20+
"wget https://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/HAMMOZ-Consortium/MPI-ESM-1-2-HAM/historical/{config["
21+
"run]}/6hrPlevPt/"
22+
"{config[cmip_name]}/gn/{config[version]}/"
23+
"{config[cmip_name]}_6hrPlevPt_MPI-ESM-1-2-HAM_historical_{config[run]}_gn_{wildcards.year_str}.nc "
24+
"-O {wildcards.dataset}/raw/{config[name]}/{config[name]}_{wildcards.year_str}_raw.nc"
25+
26+
rule regrid:
27+
input:
28+
"{dataset}/raw/{name}/{name}_{year_str}_raw.nc"
29+
output:
30+
"{dataset}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc.tmp"
31+
shell:
32+
"python ../../src/data_preprocessing/regrid.py \
33+
--input_fns {input} \
34+
--output_dir {wildcards.dataset}/{wildcards.res}deg/{wildcards.name} \
35+
--ddeg_out {wildcards.res} \
36+
--cmip 1 \
37+
--rename {config[cmip_name]} {config[era_name]} \
38+
--file_ending nc.tmp"
39+
40+
rule delete:
41+
input:
42+
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc.tmp",
43+
res=config['res']),
44+
output:
45+
expand("{{dataset}}/{res}deg/{{name}}/{{name}}_{{year_str}}_{res}deg.nc",
46+
res=config['res'])
47+
priority: 100
48+
run:
49+
for i, o in zip(input, output):
50+
shell("mv {i} {o}")
51+
# shell("rm {wildcards.dataset}/raw/{wildcards.name}/{wildcards.name}_{wildcards.year_str}_raw.nc"),
52+
53+
54+
rule all:
55+
input:
56+
expand("{datadir}/{res}deg/{name}/{name}_{year_str}_{res}deg.nc",
57+
datadir=config['datadir'], res=config['res'], name=config['name'], year_str=year_strings)
58+
59+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: 10m_u_component_of_wind
3+
cmip_name: uas
4+
era_name: u10
5+
run: r1i1p1f1
6+
version: v20190627
7+
res:
8+
- 1.40625
9+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: 10m_v_component_of_wind
3+
cmip_name: vas
4+
era_name: v10
5+
run: r1i1p1f1
6+
version: v20190627
7+
res:
8+
- 1.40625
9+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: 2m_temperature
3+
cmip_name: tas
4+
era_name: t2m
5+
run: r1i1p1f1
6+
version: v20190628
7+
res:
8+
- 1.40625
9+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: geopotential
3+
cmip_name: zg
4+
era_name: z
5+
run: r1i1p1f1
6+
version: v20190628
7+
res:
8+
- 1.40625
9+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: specific_humidity
3+
cmip_name: hus
4+
era_name: q
5+
run: r1i1p1f1
6+
version: v20190628
7+
res:
8+
- 1.40625
9+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: temperature
3+
cmip_name: ta
4+
era_name: t
5+
run: r1i1p1f1
6+
version: v20190628
7+
res:
8+
- 1.40625
9+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: u_component_of_wind
3+
cmip_name: ua
4+
era_name: u
5+
run: r1i1p1f1
6+
version: v20190628
7+
res:
8+
- 1.40625
9+
# - 5.625
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
datadir: /data/CMIP6/HAMMOZ
2+
name: v_component_of_wind
3+
cmip_name: va
4+
era_name: v
5+
run: r1i1p1f1
6+
version: v20190628
7+
res:
8+
- 1.40625
9+
# - 5.625

0 commit comments

Comments
 (0)