reinforcedLearning.cpp
#include "reinforcedLearning.h"

#include <cassert>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <vector>

#include <utils/randomGenerator.h>
#include <utils/stringUtils.h>

#include <mdp/mdpConfiguration.h>
#include <mdp/context.h>
#include <mdp/actionSpace.h>
#include <mdp/policy.h>
#include <mdp/stateSpace.h>

#include "rlBackupAlgo/qLearning.h"
/* Headers for the remaining backup algorithms used below; the exact paths are
 * assumed to mirror rlBackupAlgo/qLearning.h */
#include "rlBackupAlgo/sarsaLambda.h"
#include "rlBackupAlgo/delayedQLearning.h"
#include "rlBackupAlgo/watkinsQLambda.h"
#include "rlBackupAlgo/naiveQLambda.h"

using namespace Mdp;

ReinforcedLearning::ReinforcedLearning(std::shared_ptr<Context> c)
    : LearningStrategy(c)
    , S(c->stateSpace->size())
    , A(c->actionSpace->size())
    , actionValuesRecord(c->conf, S, A)
    , rewardRecord(c->conf, "rewardRecord")
{
    assert(context != nullptr);
    assert(context->conf != nullptr);
}

ReinforcedLearning::~ReinforcedLearning()
{
    if (backupAlgo != nullptr)
        delete backupAlgo;
}

/*TODO: what to do with initializeModel vs constructor?
 * initializeModel is a public function. Why? Where is it called?*/
void ReinforcedLearning::initializeModel()
{
    previousState = context->stateSpace->getState();
    std::cerr << "initial state is: " << previousState << "\n";
    assert(context != nullptr);
    assert(context->conf != nullptr);
    backupAlgo->init();
    /*FIXME: REDUNDANT*/
}

{
#ifdef PRINT
    size_t S = context->stateSpace->size();
    for (size_t s = 0; s < S; s++)
    {
        std::vector<size_t> *vect = context->stateSpace->factorize(s);
        for (size_t i = 0; i < vect->size(); i++)
        {
            std::cerr << (*vect)[i] << " ";
        }
        std::cerr << "\n";
    }
#endif
}

{
    std::string str = context->conf->getStringValue("reinforcementLearning", "actionSelectionStrategy");
    if (!str.compare("epsilonGreedy"))
    {
        double epsilon = context->conf->getRlEpsilonFromFile();
        double epsilonDecaySpeed = context->conf->getDoubleValue(
            "reinforcementLearning", "epsilonDecaySpeed");
        long long unsigned int epsilonTimeout = context->conf->getUnsignedLongLongIntValue(
            "reinforcementLearning", "epsilonTimeOut");
        actionSelectionStrategy = new EpsilonGreedy(epsilon, epsilonDecaySpeed, epsilonTimeout);
    }
    else if (!str.compare("greedy"))
    {
        actionSelectionStrategy = new EpsilonGreedy(0.0, 0.0, 0);
    }
    else if (!str.compare("Gibbs"))
    {
        double temperature = context->conf->getDoubleValue("reinforcementLearning", "GibbsTemperature");
        double tempDecaySpeed = context->conf->getDoubleValue("reinforcementLearning", "GibbsTempDecaySpeed");
        double tempStepSize = context->conf->getDoubleValue("reinforcementLearning", "GibbsTempStepSize");
        actionSelectionStrategy = new GibbsActionSelection(temperature, tempDecaySpeed, tempStepSize);
    }
    else
    {
        throw std::invalid_argument("invalid value for actionSelectionStrategy");
    }
}
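
/* For reference, a hypothetical excerpt of the configuration consumed above.
 * The file format and the concrete values shown are assumptions for
 * illustration only; the key names are the ones actually read in this file.
 *
 *   [reinforcementLearning]
 *   actionSelectionStrategy = epsilonGreedy   ; alternatives: greedy, Gibbs
 *   epsilonDecaySpeed       = 0.001
 *   epsilonTimeOut          = 1000000
 *   GibbsTemperature        = 1.0
 *   GibbsTempDecaySpeed     = 0.01
 *   GibbsTempStepSize       = 0.1
 */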

{
    std::string initStr = context->conf->getStringValue("reinforcementLearning", "initialPolicy");
    if (!initStr.compare("uniform"))
    {
        context->policy->initializeUniformly();
    }
    else if (!initStr.compare("fromFile"))
    {
        /*TODO: path is hardcoded*/
        context->policy->initializeFromFile("configuration/initialPolicy");
    }
    else
    {
        throw std::invalid_argument("initial policy not defined");
    }
}
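
/* One way to address the "path is hardcoded" TODO in the fromFile branch above
 * (sketch only; the configuration key "initialPolicyFile" is hypothetical and
 * not an existing key in this project):
 *
 *   std::string path = context->conf->getStringValue("reinforcementLearning", "initialPolicyFile");
 *   context->policy->initializeFromFile(path);
 */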

RlBackupAlgorithm *ReinforcedLearning::getBackupAlgorithm()
{
    std::string str = context->conf->getStringValue("reinforcementLearning", "algo");
    if (!str.compare(QLearning::configKey))
    {
        return new QLearning(context, dynamic_cast<TabularActionValues*>(actionValues));
    }
    if (!str.compare(SarsaLambda::configKey))
    {
        return new SarsaLambda(context, dynamic_cast<TabularActionValues*>(actionValues));
    }
    if (!str.compare(DelayedQLearning::configKey))
    {
        return new DelayedQLearning(context, dynamic_cast<TabularActionValues*>(actionValues));
    }
    if (!str.compare(WatkinsQLambda::configKey))
    {
        return new WatkinsQLambda(context, dynamic_cast<TabularActionValues*>(actionValues));
    }
    if (!str.compare(NaiveQLambda::configKey))
    {
        return new NaiveQLambda(context, dynamic_cast<TabularActionValues*>(actionValues));
    }
    throw std::runtime_error("Reinforcement Learning algorithm lookup failed");
}


{
    /*We use Q-learning*/
    previousAction = context->actionSpace->getLastAction();
    state_t newState = context->stateSpace->getState();
    double reward = context->stateSpace->getReward();
    if (reward == -HUGE_VAL)
        reward = -1.0e100;

    static double discountFactor = context->conf->getDoubleValue("mdp", "discountFactor");
    updateLongTermReward(reward, discountFactor);

    //updateActualDiscountedReward(reward);
    /* Assumption: the backup and policy update happen here through the
     * RlBackupAlgorithm interface. */
    backupAlgo->updateActionValues(previousState, newState, previousAction, reward);
    updatePolicy(newState);

    previousState = newState;
}

void ReinforcedLearning::updateLongTermReward(double reward, double discountFactor)
{
    longTermReward += reward;
}

void ReinforcedLearning::updateActualDiscountedReward(double reward)
{
    static long long int counter = 0;
    actualDiscountedReward += reward;
}

void ReinforcedLearning::updatePolicy(state_t state)
{
    static const bool updatePolicy = context->conf->getBoolValue("reinforcementLearning", "updatePolicy", true);
    if (!updatePolicy)
    {
        return;
    }
    epsilonGreedyPolicyUpdate(state); //FIXME: remove epsilonGreedy from the name
}

void ReinforcedLearning::epsilonGreedyPolicyUpdate(state_t state)
{
    action_t bestAction = getBestAction(state);
    std::vector<double> av = actionValues->getValues(state);
    //bestAction = getBestActionFromInitialPolicy(state);
    std::vector<double> policy = actionSelectionStrategy->generatePolicy(av, bestAction);
    context->policy->update(state, policy);
}

action_t ReinforcedLearning::getBestActionFromInitialPolicy(state_t s)
{
    size_t S = context->stateSpace->size();
    size_t A = context->actionSpace->size();
    static std::vector<std::vector<double>> init(S, std::vector<double>(A));
    static bool valid = false;
    static std::vector<action_t> bestAction(S);
    if (!valid)
    {
        valid = true;
        std::string filename = "configuration/initialPolicy";
        std::fstream stream;
        stream.open(filename);
        if (!stream.is_open())
            throw std::runtime_error("cannot open file");
        std::string line;
        std::vector<std::vector<double>> pol;
        size_t a = 0;
        while (std::getline(stream, line))
        {
            std::vector<std::string> elements = Utils::StringUtils::split(line, ' ');
            std::vector<double> row;
            /*TODO: this can be rewritten more elegantly*/
            for (size_t i = 0; i < elements.size(); i++)
            {
                row.push_back(std::stod(elements[i]));
            }
            init[a++] = row;
        }
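        /* The parsing loop above could be tightened as the TODO suggests; a
         * minimal sketch (assumes <algorithm> and <iterator> are available):
         *
         *   row.reserve(elements.size());
         *   std::transform(elements.begin(), elements.end(), std::back_inserter(row),
         *                  [](const std::string &e) { return std::stod(e); });
         */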
        double bestValue;
        for (size_t s = 0; s < S; s++)
        {
            bestAction[s] = 0;
            bestValue = init[s][0];
            for (size_t a = 1; a < A; a++)
            {
                if (init[s][a] > bestValue)
                {
                    bestValue = init[s][a];
                    bestAction[s] = a;
                }
            }
        }
    }

    return bestAction[s];
}

action_t ReinforcedLearning::getBestAction(state_t state)
{
    return backupAlgo->getBestAction(state);
}


{
#ifdef PRINT
    /*Action values are Q-learning's equivalent to mdp policy table*/
#endif
    rewardRecord.printToFile("reports");
#ifdef PRINT
    std::cerr << "the long-term reward is " << longTermReward << "\n";
#endif
}


void ReinforcedLearning::printActionValuesToFile(std::string folder)
{
    std::ofstream file;
    std::ofstream normalized;
    file.open(folder + "/rlfile.txt", std::ios_base::app);
    normalized.open(folder + "/rlfilenormalized.txt", std::ios_base::app);
    for (unsigned int i = 0; i < S; i++)
    {
        bool allEqual = true;
        unsigned int maxIndex = 0;
        double maxValue = actionValues->getValue(i, 0);
        for (size_t j = 1; j < A; j++)
        {
            if (actionValues->getValue(i, j) > maxValue)
            {
                maxValue = actionValues->getValue(i, j);
                maxIndex = j;
            }
            double eps = 0.0000001; //TODO: what value should this be? Maybe make it relative to the abs value of actionValues[i][j]
            if (actionValues->getValue(i, j) < actionValues->getValue(i, j-1) - eps
                || actionValues->getValue(i, j) > actionValues->getValue(i, j-1) + eps)
                allEqual = false;
        }
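        /* A relative tolerance, as the TODO above suggests, could look like
         * this (sketch only; the 1e-7 factor is illustrative):
         *
         *   double diff  = std::fabs(actionValues->getValue(i, j) - actionValues->getValue(i, j-1));
         *   double scale = std::fabs(actionValues->getValue(i, j));
         *   if (diff > 1e-7 * std::max(scale, 1.0))
         *       allEqual = false;
         */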
        for (unsigned int j = 0; j < A; j++)
        {
            file << actionValues->getValue(i, j) << " ";
            normalized << ((allEqual || j == maxIndex) ? 1 : 0) << " ";
        }
        file << "\n";
        normalized << "\n";
    }
    file.close();
    normalized.close();
}