forked from pytorch/examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_distributed_examples.sh
executable file
·96 lines (83 loc) · 2.06 KB
/
run_distributed_examples.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env bash
#
# This script runs through the code in each of the python examples.
# The purpose is just as an integration test, not to actually train models in any meaningful way.
# For that reason, most of these set epochs = 1 and --dry-run.
#
# Optionally specify a comma separated list of examples to run.
# can be run as:
# ./run_python_examples.sh "install_deps,run_all,clean"
# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
# Expects pytorch, torchvision to be installed.
BASE_DIR=`pwd`"/"`dirname $0`
EXAMPLES=`echo $1 | sed -e 's/ //g'`
# Redirect 'python' calls to 'python3'
python() {
command python3 "$@"
}
USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())")
case $USE_CUDA in
"True")
echo "using cuda"
CUDA=1
CUDA_FLAG="--cuda"
;;
"False")
echo "not using cuda"
CUDA=0
CUDA_FLAG=""
;;
"")
exit 1;
;;
esac
ERRORS=""
function error() {
ERR=$1
ERRORS="$ERRORS\n$ERR"
echo $ERR
}
function install_deps() {
echo "installing requirements"
cat $BASE_DIR/*/requirements.txt | \
sort -u | \
# testing the installed version of torch, so don't pip install it.
grep -vE '^torch$' | \
pip install -r /dev/stdin || \
{ error "failed to install dependencies"; exit 1; }
}
function start() {
EXAMPLE=${FUNCNAME[1]}
cd $BASE_DIR/$EXAMPLE
echo "Running example: $EXAMPLE"
}
function distributed() {
start
torchrun --standalone --nnodes=1 --nproc_per_node=4 tensor_parallelism/fsdp_tp_example.py
}
function clean() {
cd $BASE_DIR
echo "running clean to remove cruft"
}
function run_all() {
distributed
}
# by default, run all examples
if [ "" == "$EXAMPLES" ]; then
run_all
else
for i in $(echo $EXAMPLES | sed "s/,/ /g")
do
echo "Starting $i"
$i
echo "Finished $i, status $?"
done
fi
if [ "" == "$ERRORS" ]; then
echo "Completed successfully with status $?"
else
echo "Some examples failed:"
printf "$ERRORS"
#Exit with error (0-255) in case of failure in one of the tests.
exit 1
fi