QPlane.py
import socket
import time
import os
import numpy as np
import matplotlib.pyplot as plt
from src.algorithms.QDoubleDeepLearn import QLearn # can be QLearn, QDeepLearn, QDoubleDeepLearn or RandomAgent
from src.environments.jsbsim.JSBSimEnv import Env # can be jsbsim.JSBSimEnv or xplane.XPlaneEnv
from src.scenarios.deltaAttitudeControlScene import Scene # can be deltaAttitudeControlScene, sparseAttitudeControlScene or cheatingAttitudeControlScene
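# QPlane: trains a Q-learning agent (tabular or deep) to stabilize an aircraft's
# attitude in a simulated flight environment, periodically logging, saving and
# plotting the results of the experiment.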
experimentName = "Experiment"
connectAttempts = 0.0 # incremented every time UDP packets are lost on a single retry
notes = "This experiment was run with..." # notes saved to the setup file to clarify the experiment setup
dateTime = str(time.ctime(time.time()))
dateTime = dateTime.replace(":", "-")
dateTime = dateTime.replace(" ", "_")
experimentName = experimentName + "-" + dateTime
errors = 0.0 # incremented every time UDP packets are lost on all retries
timeStart = time.time() # used to measure time
timeEnd = time.time() # used to measure time
logPeriod = 100 # metrics are printed to the console every logPeriod steps
savePeriod = 25 # every savePeriod epochs the table/model will be saved to a file
movingRate = 1 # multiplied by savePeriod below; sets the rate (in epochs) at which metrics are averaged, saved and plotted
pauseDelay = 0.1 # time for which an action is applied to the environment
logDecimals = 0 # number of decimals used when printing np arrays
np.set_printoptions(precision=logDecimals) # applies the decimal setting for np array printing
n_epochs = 50_000 # Number of epochs (generations) to run
n_steps = 1_000 # Number of steps (inputs) per epoch
n_actions = 4 # Number of possible inputs to choose from
n_states = 182 # Number of states for non-Deep QLearning
gamma = 0.75 # The discount rate, between 0 and 1; at 0 the agent only values the immediate reward, and the higher it is the more the estimated future Q factors into the update
lr = 0.0001 # Learning rate (deep ~0.0001, non-deep ~0.01); at 0 the Q value never updates, and the higher it is the faster the agent adopts the new Q value - at 1 the update replaces the old Q value entirely with the newly calculated one
epsilon = 1.0 # Starting epsilon; sets the exploration probability and will decay
decayRate = 0.00001 # Rate at which epsilon will decay per step
epsilonMin = 0.1 # Minimum value at which epsilon will stop decaying
n_epochsBeforeDecay = 10 # number of epochs to run before epsilon starts to decay
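# Decay schedule implied by the values above (implemented inside QLearn; assumed
# here to be linear per step): once n_epochsBeforeDecay epochs have passed,
# epsilon = max(epsilonMin, epsilon - decayRate) after every step.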
numOfInputs = 7 # Number of inputs fed to the model
stateDepth = 1 # Number of past observations kept in the current state; the state consists of s(t) ... s(t-n)
minReplayMemSize = 1_000 # min size determines when the replay will start being used
replayMemSize = 100_000 # Max size for the replay buffer
batchSize = 256 # Batch size for the model
updateRate = 5 # update target model every so many episodes
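# The replay settings above follow standard DQN practice: learning starts once the
# buffer holds minReplayMemSize transitions, batches of batchSize are sampled from
# at most replayMemSize stored transitions, and (for the deep variants) the target
# network is synced every updateRate episodes.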
startingOffset = 0 # epoch offset, used if previous results are loaded
loadModel = False # will load "model.h5" for tf if True (model.npy for non-Deep)
loadMemory = False # will load "memory.pickle" if True
loadResults = False # will load "results.npy" if True
jsbRender = False # will send UDP data to FlightGear for rendering if True
jsbRealTime = False # will slow the physics down to real time for rendering
usePredefinedSeeds = False # Sets seeds for tf, np and random for more replicable results (not fully replicable due to stochastic environments)
saveResultsToPlot = True # Saves results to a png in the experiment folder at runtime
saveForAutoReload = False # Saves and overwrites models, results and memory in the root folder
startingVelocity = 60
startingPitchRange = 10
startingRollRange = 15
randomDesiredState = True # Set a new state to stabilize towards every episode
desiredPitchRange = 5
desiredRollRange = 5
dictObservation = {
"lat": 0,
"long": 1,
"alt": 2,
"pitch": 3,
"roll": 4,
"yaw": 5,
"gear": 6}
dictAction = {
"pi+": 0,
"pi-": 1,
"ro+": 2,
"ro-": 3,
"ru+": 4,
"ru-": 5,
"no": 6}
dictErrors = {
"reset": 0,
"update": 0,
"step": 0}
dictRotation = {
"roll": 0,
"pitch": 1,
"yaw": 2,
"northVelo": 3,
"eastVelo": 4,
"verticalVelo": 5}
# -998->NO CHANGE
flightOrigin = [35.126, 126.809, 6000, 0, 0, 0, 1] # Gwangju SK
flightDestination = [33.508, 126.487, 6000, -998, -998, -998, 1] # Jeju SK
# Other locations to use: Memmingen: [47.988, 10.240], Chicago: [41.976, -87.902]
epochRewards = []
epochQs = []
movingRate = savePeriod * movingRate # window (in epochs) over which the moving average is taken; best kept a multiple of savePeriod
movingEpRewards = {
"epoch": [],
"average": [],
"minimum": [],
"maximum": [],
"averageQ": [],
"epsilon": []}
fallbackState = [0] * numOfInputs # Used in case of connection error to XPlane
fallbackState = [tuple(fallbackState)]
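# i.e. a list holding a single tuple of numOfInputs zeros, presumably matching the
# observation shape the deep models expect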
# Will load previous results in case an experiment needs to be continued
if(loadResults):
    movingEpRewards = np.load("results.npy", allow_pickle=True).item() # loads the file - .item() turns the loaded nparray back into a dict
    startingOffset = np.max(movingEpRewards["epoch"]) # loads the epoch at which the run previously stopped
    epsilon = np.min(movingEpRewards["epsilon"]) # loads the epsilon at which the previous experiment stopped
    n_epochsBeforeDecay = max(0, n_epochsBeforeDecay - startingOffset) # adjusts n_epochsBeforeDecay accordingly - max clamps it at 0 so it never goes negative
if(usePredefinedSeeds):
np.random.seed(42)
Q = QLearn(n_states, n_actions, gamma, lr, epsilon,
decayRate, epsilonMin, n_epochsBeforeDecay, experimentName, saveForAutoReload, loadModel, usePredefinedSeeds,
loadMemory, numOfInputs, minReplayMemSize, replayMemSize, batchSize, updateRate, stateDepth)
scene = Scene(dictObservation, dictAction, n_actions, stateDepth, startingVelocity, startingPitchRange, startingRollRange, usePredefinedSeeds, randomDesiredState, desiredPitchRange, desiredRollRange)
env = Env(scene, flightOrigin, flightDestination, n_actions, usePredefinedSeeds,
          dictObservation, dictAction, dictRotation, startingVelocity, pauseDelay, Q.id, jsbRender, jsbRealTime)
# saving setup pre run
if not os.path.exists("./Experiments/" + experimentName):
os.makedirs("./Experiments/" + experimentName)
setup = f"{experimentName=}\n{Q.numGPUs=}\n{dateTime=}\nendTime=not yet defined - first save\n{Q.id=}\n{env.id=}\n{scene.id=}\n{pauseDelay=}\n{n_epochs=}\n"
setup += f"{n_steps=}\n{n_actions=}\n{n_states=} - states for non deep\n{gamma=}\n{lr=}\n{epsilon=}\n{decayRate=}\n{epsilonMin=}\n{n_epochsBeforeDecay=}\n"
setup += f"{numOfInputs=} - states for deep\n{minReplayMemSize=}\n{replayMemSize=}\n{batchSize=}\n{updateRate=}\n{loadModel=}\n{movingRate=}\n"
setup += f"{randomDesiredState=}\n{desiredRollRange=}\n{desiredPitchRange=}\n{startingRollRange=}\n{startingPitchRange=}\n{startingVelocity=}\n{stateDepth=}\n{Q.modelSummary=}\n{notes=}\n"
with open("./Experiments/" + str(experimentName) + "/setup.out", 'w') as setupFile:
    print(setup, file=setupFile) # saves hyperparameters to the experiment folder
# prints out all metrics
def log(i_epoch, i_step, reward, logList):
    global timeStart # Used to print time elapsed between log calls
    global timeEnd # Used to print time elapsed between log calls
old_state = logList[0]
new_state = logList[1]
actions_binary = logList[3]
observation = logList[4]
control = logList[5]
explore = logList[6]
currentEpsilon = logList[7]
if(Q.id == "deep" or Q.id == "doubleDeep"):
depth = len(old_state)
depth = "Depth " + str(depth)
old_state = old_state[-1]
new_state = new_state[-1]
else:
depth = ""
timeEnd = time.time() # End timer here
print("\t\tGame ", i_epoch,
"\n\t\t\tMove ", i_step,
"\n\t\t\tStarting Rotation ", np.array(env.startingOrientation).round(logDecimals),
"\n\t\t\tDestination Rotation ", env.desiredState,
"\n\t\t\tTime taken ", timeEnd - timeStart,
"\n\t\t\tOld State ", np.array(old_state).round(logDecimals), depth,
"\n\t\t\tNew State ", np.array(new_state).round(logDecimals), depth,
"\n\t\t\t\t\t[p+,p-,r+,r-]",
"\n\t\t\tactions_binary = ", actions_binary,
"\n\t\t\tCurrent Control:", control,
"\n\t\t\tCurrent Qs:", Q.currentTable,
"\n\t\t\tCurrent Orientation: ", np.array(observation[dictObservation["pitch"]:dictObservation["gear"]]).round(logDecimals),
"\n\t\t\tCurrent AVE of QTable: ", np.average(Q.qTable),
"\n\t\t\tExplored (Random): ", explore,
"\n\t\t\tCurrent Epsilon: ", currentEpsilon,
"\n\t\t\tCurrent Reward: ", reward,
"\n\t\t\tReconnects Percentage & Count: ", float(connectAttempts / (i_epoch * n_steps + i_step + 1)), ",", connectAttempts,
"\n\t\t\tError Percentage & Count: ", float(errors / (i_epoch * n_steps + i_step + 1)), ",", errors,
"\n\t\t\tError Code: ", dictErrors, "\n")
timeStart = time.time() # Start timer here
# A single step (input); this repeats n_steps times throughout an epoch
def step(i_step, done, reward, oldState):
global errors
global connectAttempts
if(Q.id == "deep" or Q.id == "doubleDeep"):
oldState = list(oldState)
action, explore, currentEpsilon = Q.selectAction(oldState, i_epoch, n_epochs)
# Check if connections can be established 10x
for attempt in range(10):
try:
newState, reward, done, info = env.step(action)
if(i_step == n_steps):
done = True # mark done if episode is finished
except socket.error as socketError: # the specific error for connections used by xpc
dictErrors["step"] = socketError
connectAttempts += 1
continue
else:
break
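    # Python for-else: the block below runs only if the loop finished without a break,
    # i.e. every one of the 10 attempts raised a socket error.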
else: # if all 10 attempts fail
errors += 1
if(Q.id == "deep" or Q.id == "doubleDeep"):
newState = fallbackState
else:
newState = 0
reward = 0
done = False
info = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], 0]
        pass # error occurred in the step retry loop (as opposed to the reset loop in epoch)
newPosition = info[0]
actions_binary = info[1]
control = info[2]
# checking if state includes a NaN (happens in JSBSim sometimes)
if(np.isnan(newState).any()):
if(Q.id == "deep" or Q.id == "doubleDeep"):
newState = fallbackState
else:
newState = 0
reward = 0
info = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], 0]
dictErrors["step"] = "NaN in state"
errors += 1
done = True
Q.learn(oldState, action, reward, newState, done)
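    # Inside Q.learn the tabular variant presumably applies the standard update
    # Q(s, a) += lr * (reward + gamma * max_a' Q(s', a') - Q(s, a)); the deep
    # variants fit the network toward the same target using replayed batches.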
logList = [oldState, newState, action, actions_binary, newPosition, control, explore, currentEpsilon]
if(Q.id == "deep" or Q.id == "doubleDeep"):
oldState = list(newState)
else:
oldState = newState
return done, reward, logList, oldState
# An epoch is one full run, from respawn/reset to the final step.
def epoch(i_epoch):
global errors
global connectAttempts
epochReward = 0
epochQ = 0
for attempt in range(25):
try:
oldState = env.reset()
except socket.error as socketError: # the specific error for connections used by xpc
dictErrors["reset"] = socketError
connectAttempts += 1
continue
else:
break
else: # if all 25 attempts fail
if(Q.id == "deep" or Q.id == "doubleDeep"):
oldState = fallbackState # Error was during reset
else:
oldState = 0
errors += 1
if(i_epoch % savePeriod == 0):
Q.archive(i_epoch)
done = False
reward = 0
for i_step in range(n_steps + 1):
done, reward, logList, oldState = step(i_step, done, reward, oldState)
epochReward += reward
epochQ += np.argmax(Q.currentTable)
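        # note: np.argmax returns the index of the greedy action, so epochQ
        # accumulates action indices rather than Q values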
if(i_step % logPeriod == 0): # log every logPeriod steps
log(i_epoch, i_step, reward, logList)
dictErrors["reset"], dictErrors["update"], dictErrors["step"] = [0, 0, 0]
if done:
break
epochRewards.append(epochReward)
epochQs.append(epochQ)
if(i_epoch % movingRate == 0 and i_epoch != 0):
movingEpRewards["epoch"].append(i_epoch)
averageReward = sum(epochRewards[-movingRate:]) / len(epochRewards[-movingRate:])
movingEpRewards["average"].append(averageReward)
movingEpRewards["minimum"].append(min(epochRewards[-movingRate:]))
movingEpRewards["maximum"].append(max(epochRewards[-movingRate:]))
averageQ = sum(epochQs[-movingRate:]) / len(epochQs[-movingRate:])
movingEpRewards["averageQ"].append(averageQ)
movingEpRewards["epsilon"].append(logList[7])
for i_epoch in range(startingOffset, startingOffset + n_epochs + 1):
epoch(i_epoch)
if(i_epoch % savePeriod == 0):
np.save("./Experiments/" + str(experimentName) + "/results" + str(i_epoch) + ".npy", movingEpRewards)
if(saveForAutoReload):
np.save("results.npy", movingEpRewards)
if(saveResultsToPlot and i_epoch % movingRate == 0):
plt.plot(movingEpRewards['epoch'], movingEpRewards['average'], label="average rewards")
plt.plot(movingEpRewards['epoch'], movingEpRewards['averageQ'], label="average Qs")
plt.plot(movingEpRewards['epoch'], movingEpRewards['maximum'], label="max rewards")
plt.plot(movingEpRewards['epoch'], movingEpRewards['minimum'], label="min rewards")
plt.plot(movingEpRewards['epoch'], movingEpRewards['epsilon'], label="epsilon")
plt.title("Results")
plt.xlabel("episodes")
plt.ylabel("reward")
plt.legend(loc=4)
plt.savefig("./Experiments/" + str(experimentName) + "/plot" + str(i_epoch) + ".png")
plt.clf()
np.save("./Experiments/" + str(experimentName) + "/results_final.npy", movingEpRewards)
endTime = str(time.ctime(time.time()))
# saving setup post run
setup = f"{experimentName=}\n{Q.numGPUs=}\n{dateTime=}\n{endTime=}\n{Q.id=}\n{env.id=}\n{scene.id=}\n{pauseDelay=}\n{n_epochs=}\n"
setup += f"{n_steps=}\n{n_actions=}\n{n_states=} - states for non deep\n{gamma=}\n{lr=}\n{epsilon=}\n{decayRate=}\n{epsilonMin=}\n{n_epochsBeforeDecay=}\n"
setup += f"{numOfInputs=} - states for deep\n{minReplayMemSize=}\n{replayMemSize=}\n{batchSize=}\n{updateRate=}\n{loadModel=}\n{movingRate=}\n"
setup += f"{randomDesiredState=}\n{desiredRollRange=}\n{desiredPitchRange=}\n{startingRollRange=}\n{startingPitchRange=}\n{startingVelocity=}\n{stateDepth=}\n{Q.modelSummary=}\n{notes=}\n"
with open("./Experiments/" + str(experimentName) + "/setup.out", 'w') as setupFile:
    print(setup, file=setupFile) # saves hyperparameters to the experiment folder
print("<<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>>>>")