8
8
9
9
10
10
_rendezvous_handlers = {}
11
+ _default_store_timeout = timedelta (minutes = 5 )
11
12
12
13
13
14
def register_rendezvous_handler (scheme , handler ):
@@ -53,7 +54,7 @@ def _rendezvous_error(msg):
53
54
return ValueError ("Error initializing torch.distributed using " + msg )
54
55
55
56
56
- def _file_rendezvous_handler (url ):
57
+ def _file_rendezvous_handler (url , timeout = _default_store_timeout ):
57
58
def _error (msg ):
58
59
return _rendezvous_error ("file:// rendezvous: " + msg )
59
60
@@ -69,14 +70,14 @@ def _error(msg):
69
70
70
71
rank = int (query ["rank" ])
71
72
world_size = int (query ["world_size" ])
72
- store = FileStore (path , world_size )
73
+ store = FileStore (path , world_size , timeout )
73
74
yield (store , rank , world_size )
74
75
75
76
# If this configuration is invalidated, there is nothing we can do about it
76
77
raise RuntimeError ("Unable to perform rerendezvous using file:// method" )
77
78
78
79
79
- def _tcp_rendezvous_handler (url ):
80
+ def _tcp_rendezvous_handler (url , timeout = _default_store_timeout ):
80
81
def _error (msg ):
81
82
return _rendezvous_error ("tcp:// rendezvous: " + msg )
82
83
@@ -92,14 +93,14 @@ def _error(msg):
92
93
rank = int (query ["rank" ])
93
94
world_size = int (query ["world_size" ])
94
95
start_daemon = rank == 0
95
- store = TCPStore (result .hostname , result .port , world_size , start_daemon )
96
+ store = TCPStore (result .hostname , result .port , world_size , start_daemon , timeout )
96
97
yield (store , rank , world_size )
97
98
98
99
# If this configuration is invalidated, there is nothing we can do about it
99
100
raise RuntimeError ("Unable to perform rerendezvous using tcp:// method" )
100
101
101
102
102
- def _env_rendezvous_handler (url ):
103
+ def _env_rendezvous_handler (url , timeout = _default_store_timeout ):
103
104
def _error (msg ):
104
105
return _rendezvous_error ("env:// rendezvous: " + msg )
105
106
@@ -140,7 +141,7 @@ def _env_error(var):
140
141
141
142
# Now start the TCP store daemon on the rank 0
142
143
start_daemon = rank == 0
143
- store = TCPStore (master_addr , master_port , world_size , start_daemon )
144
+ store = TCPStore (master_addr , master_port , world_size , start_daemon , timeout )
144
145
yield (store , rank , world_size )
145
146
146
147
# If this configuration is invalidated, there is nothing we can do about it
0 commit comments