-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredict.Dockerfile
61 lines (48 loc) · 3.2 KB
/
predict.Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# syntax=docker/dockerfile:1
# Prediction image: layers a downloaded, fine-tuned BlueBERT model on top of the
# project base image and runs predict.entrypoint.sh against to-be-classified sentences.
ARG PROJECT_ID=latest
ARG BASE_VERSION=latest
FROM gcr.io/${PROJECT_ID}/bluebert-base:${BASE_VERSION}

# ARGs declared before FROM are not visible in the stage; these are stage-local.
ARG TUNED_MODEL_VERSION=latest
# Note: TASK_NAME must align with names used by the prob2label.py script
# and also must align with the data directory structure, e.g. bl_chemical_to_gene
ARG TASK_NAME=latest
# MODEL_STORAGE_BUCKET is the GCS bucket name/path where the trained model will be
# downloaded from. It is interpolated into an https://storage.googleapis.com/ URL
# below, so supply it WITHOUT a gs:// prefix, e.g. a/b/c
ARG MODEL_STORAGE_BUCKET=latest
# CLASSIFICATION_LABELS is a tab-delimited string of the possible classification labels for the task, e.g. 'treats false'
ARG CLASSIFICATION_LABELS=latest

# --chmod guarantees the entrypoint is executable regardless of the build-context permissions.
COPY --chmod=755 scripts/predict.entrypoint.sh /home/dev/entrypoint.sh

# key=value form (legacy space-separated ENV is deprecated).
ENV BlueBERT_DIR='/home/dev/models/tuned'
WORKDIR /home/dev/models/tuned

# Re-export build args as ENV so the values survive into the running container
# for use by the entrypoint.
ENV TUNED_MODEL_VERSION_ENV=${TUNED_MODEL_VERSION} \
    TASK_NAME_ENV=${TASK_NAME} \
    MODEL_STORAGE_BUCKET_ENV=${MODEL_STORAGE_BUCKET} \
    CLASSIFICATION_LABELS_ENV=${CLASSIFICATION_LABELS}

# Download and unpack the tuned model; remove the tarball in the SAME layer so it
# does not remain baked into the image.
RUN wget "https://storage.googleapis.com/${MODEL_STORAGE_BUCKET_ENV}/bert/${TASK_NAME_ENV}/${TASK_NAME_ENV}.${TUNED_MODEL_VERSION_ENV}.tar.gz" && \
    tar -xzf "${TASK_NAME_ENV}.${TUNED_MODEL_VERSION_ENV}.tar.gz" && \
    rm "${TASK_NAME_ENV}.${TUNED_MODEL_VERSION_ENV}.tar.gz"

# Exec form is required here: with shell-form ENTRYPOINT, Docker IGNORES the
# arguments passed on `docker run` (CMD), so the documented
# [SENTENCE_BUCKET] [COLLECTION] [OUTPUT_BUCKET] run arguments were silently
# dropped. The sh -c wrapper keeps the runtime $VAR expansion the original
# relied on, `exec` makes the entrypoint PID 1 (receives SIGTERM), and the
# trailing "entrypoint" element becomes $0 so run-time args land in "$@".
ENTRYPOINT ["/bin/sh", "-c", "exec /home/dev/entrypoint.sh \"$TASK_NAME_ENV\" \"$CLASSIFICATION_LABELS_ENV\" \"$TUNED_MODEL_VERSION_ENV\" \"$@\"", "entrypoint"]
# To build:
# docker build --build-arg "PROJECT_ID=[PROJECT_ID]" \
# --build-arg "BASE_VERSION=[BASE_VERSION]" \
# --build-arg "TASK_NAME=[TASK_NAME]" \
# --build-arg "TUNED_MODEL_VERSION=[TUNED_MODEL_VERSION]" \
# --build-arg "MODEL_STORAGE_BUCKET=[MODEL_STORAGE_BUCKET]" \
# --build-arg "CLASSIFICATION_LABELS=[CLASSIFICATION_LABELS]" \
# -t task-name:0.1 -f predict.Dockerfile
#
# where:
# [PROJECT_ID] = the id for this project - it is used to retrieve the already built base image (from base.Dockerfile)
# [BASE_VERSION] = the version of the base container to use
# [TASK_NAME] = the name of the task that the model is being trained for. TASK_NAME must align with names used by the prob2label.py script
# and also must align with the data directory structure, e.g. bl_chemical_to_gene
# [TUNED_MODEL_VERSION] = the version of the tuned model to download. This version is part of the downloaded model archive's file name.
# [MODEL_STORAGE_BUCKET] = the Google Cloud Storage bucket name/path where the tuned model is located, WITHOUT a gs:// prefix (it is interpolated into an https://storage.googleapis.com/ download URL)
# [CLASSIFICATION_LABELS] = the classification labels that will be used as part of the file header for the output file, e.g. for the `bl_chemical_to_disease` task, the classification labels string should be "`treats false`".
# To run:
# docker run --rm [IMAGE_NAME]:[IMAGE_VERSION] [SENTENCE_BUCKET] [COLLECTION] [OUTPUT_BUCKET]
#
# where:
# [IMAGE_NAME] = the name of the Docker image - which will be the same as the TASK_NAME
# [IMAGE_VERSION] = the version of the Docker image - which will be the same as the TUNED_MODEL_VERSION
# [SENTENCE_BUCKET] = the full GCP path to where the TSV to-be-classified sentence files are located, e.g. gs://xyz/sentences/chemical-disease/
# [COLLECTION] = the name of the collection being processed, e.g. PUBMED_SUB_31, 2021_06_08
# [OUTPUT_BUCKET] = the output bucket where classified sentences will be placed, e.g. gs://xyz