#include #include #include #include #include #include "../config.h" #include "gnuplot_i.h" #include "BanditWorld.h" #define SQ(x) (x*x) #define True 1 #define False 0 #define _RND_MAX 0xffffffff uint32_t rnd (uint32_t *seed) { uint64_t z = *seed; z *= 279470273; z %= 4294967291U; return (*seed = z); } uint32_t rnd_seed (uint32_t *seed) { return 16807 * rnd(seed) % 2147483647; } float rnd_float (uint32_t *seed) { return (float) rnd (seed) / _RND_MAX; } int rnd_int (uint32_t *seed, int max) { return (int)(rnd_float (seed) * (float)max); } int rnd_bool (uint32_t *seed) { return (rnd (seed) & 0x1000000)? True : False; } float rnd_normal (uint32_t *seed) { float x, y, r2; do { x = 2.0 * rnd_float (seed) - 1.0; y = rnd_float (seed); r2 = x*x + y*y; } while (r2>1.0 || r2==0); return x * sqrt (-2.0*(log(r2))/r2); } struct state { state_t id; /* = index of state in state array */ uint32_t seed; /* state's own random seed */ double mu; /* mean of stochastic rewards */ double sigma; /* standard deviation of rewards */ struct state ** _succ; /* pointers to successors */ int visit_count; /* count of visits to the state */ double value; /* the actual value under optimal policy, with discount gama */ }; static struct state * _state = NULL; /* array of all states */ static struct state *cstate = NULL; /* pointer to current state */ static int stateCount = 0; static int succCount = 0; char fileradix[50]; char filename[100]; static FILE *freturn, *fregret, *fvisits, *fcvisits, *fmdp; /* statistics output files */ static double regret; static double avg_return; static int t; static int epoch; static double gama; static double gama2; static double r_star; static gnuplot_ctrl *gnuplot_h=NULL; static time_t last_plot=0; /** * Solves the MDP by value iteration. */ void solve (double epsilon) { int i, j, change = True; double x, max; struct state *s, *s2; struct state ** list, **top, **ptr; while (change) { change = False; for (i=0, s=_state; i_succ[0]->value; for (j=1; j_succ[j]->value; if (x>max) max = x; } x = s->mu + gama * max; if (x - s->value > epsilon) {s->value = x; change = True;} } } for (i=0, s=_state; imu, s->sigma, s->value); for (j=0; j_succ[j]->id); } fprintf (fmdp, "\n\n# Optimal loop\n"); top = list = malloc (stateCount * sizeof(struct state *)); *top = NULL; *(++top) = _state; do { s = *top; s2 = s->_succ[0]; max = s2->value; for (j=1; j_succ[j]->value > max) {max = s->_succ[j]->value; s2 = s->_succ[j];} ptr=top; while (*ptr != s2 && *ptr != NULL) ptr--; *(++top) = s2; } while (*ptr == NULL); x = 0.0; for (i=1, s=*(++ptr); ; i++, s=*(++ptr)) { x += s->mu; fprintf (fmdp, "\n %6d %.5f %.5f %.5f ", s->id, s->mu, s->sigma, s->value); for (j=0; j_succ[j]->id); if (s==s2) break; } r_star = x/i; fclose (fmdp); free (list); } void close_MDP() { int i; struct state *s; if (cstate) { fclose(fregret); fclose(freturn); fclose(fvisits); for (i=0, s=_state; i_succ); free (_state); cstate = NULL; } #ifdef HAVE_GNUPLOT if ( gnuplot_h != NULL ) gnuplot_close(gnuplot_h); #endif } state_t make_MDP (uint32_t seed, int state_count, int succ_count) { int i, j, n; struct state *s, *s2; struct state ** stack = malloc (state_count * sizeof(struct state *)); struct state **top = stack; int nb_unconnected; int change; int stack_size; double x, sigma_min, sigma_max; close_MDP(); #ifdef HAVE_GNUPLOT /* Initialize gnuplot */ gnuplot_h = gnuplot_init(); gnuplot_cmd(gnuplot_h, "set key right bottom"); #endif stateCount = state_count; succCount = succ_count; snprintf(fileradix, 50, "bandit_world_%u_%d_%d", seed, state_count, succ_count); sprintf (filename, "%s.regret.dat", fileradix); fregret = fopen (filename, "w"); fprintf (fregret, "# time cumulated regret\n"); sprintf (filename, "%s.return.dat", fileradix); freturn = fopen (filename, "w"); fprintf (freturn, "# time average return\n"); sprintf (filename, "%s.visits.dat", fileradix); fvisits = fopen (filename, "w"); fprintf (fvisits, "# time state visits\n"); sprintf (filename, "%s.mdp.dat", fileradix); fmdp = fopen (filename, "w"); fprintf (fmdp, "# state mean std_dev value successors\n"); sprintf (filename, "%s.curr_visits.dat", fileradix); regret = 0.0; avg_return = 0.0; t = 0; epoch = state_count; gama = 1.0 - 1.0 / sqrt((float) state_count); gama2 = 1.0 - 1.0 / (float)state_count; seed += state_count + succ_count; sigma_min = exp (-2.0 / SQ(rnd_float(&seed))); sigma_max = exp (-1.0 / rnd_float(&seed)); if (sigma_min > sigma_max) {x = sigma_min; sigma_min = sigma_max; sigma_max = x;} _state = malloc (state_count * sizeof(struct state)); for (i=0, s=_state; iid = i; s->seed = rnd_seed (&seed); s->mu = rnd_float (&seed); s->sigma = sigma_min + (sigma_max-sigma_min) * rnd_float (&seed); s->visit_count = 0; s->_succ = NULL; s->value = 0.0; } _state[0].visit_count = 1; _state[0]._succ = malloc (succ_count * sizeof(struct state *)); nb_unconnected = state_count-1; *top = NULL; *(++top) = _state; /* Iter : - take (from the stack) a state that is a successor but has no successors. (the stack was initialized with the initial state). - randomly set its first successors from the still unconnected states, if any. - randomly set following successors except the last from all states. At the end, all states are reachable from the initial state. */ while ((s = *(top--)) != NULL) { if (nb_unconnected) { n = rnd_int (&seed, nb_unconnected); for (i=0, s2=_state; ivisit_count) s2++; while (s2->visit_count) s2++; } else { s2 = _state + rnd_int(&seed,state_count); } for (i=0; ivisit_count == 0) { s2->visit_count = 1; nb_unconnected--; } if (s2->_succ == NULL) { s2->_succ = malloc (succ_count * sizeof(struct state *)); *(++top) = s2; } s->_succ[i] = s2; s2 = _state + rnd_int(&seed,state_count); } } /* flag states from which inital state is reachable (we use visit_count==0 as a flag) */ _state[0].visit_count = 0; do { change = False; s = _state; for (i=0; ivisit_count) { for (j=0; j_succ[j]->visit_count) { change = True; s->visit_count = 0; } } } } } while (change); /* fill the stack (that will not be used as a stack from now, actually) with those flagged states */ stack_size = 0; top = stack; for (i=0, s=_state; ivisit_count) { *(top++) = s; stack_size++; } } /* Set the last successor of all states: - if it can reach initial state, choose any, - else choose from states that can, and add state to the stack. */ for (i=0, s=_state; ivisit_count) { s->_succ[succ_count-1] = stack[rnd_int(&seed,stack_size)]; *(top++) = s; stack_size++; s->visit_count = 0; } else { s->_succ[succ_count-1] = _state + rnd_int(&seed,state_count); } } cstate = _state; free(stack); solve (0.000005); return 0; } state_t successor (action_t u) { return cstate->_succ[u]->id; } void act (action_t u, state_t *s, reward_t *r) { int i; struct state *st; cstate = cstate->_succ[u]; cstate->visit_count ++; *r = cstate->mu + cstate->sigma * rnd_normal (&(cstate->seed)); *s = cstate->id; regret += r_star - cstate->mu; avg_return *= gama2; avg_return += *r; if ((++t) % epoch == 0) { fprintf (freturn, "%7d %f\n", t, ((1-gama2)/(1-gama)) * avg_return); fflush (freturn); fprintf (fregret, "%7d %f\n", t, regret); fflush (fregret); for (i=0, st=_state; ivisit_count); fprintf (fvisits, "\n"); fflush (fvisits); fcvisits = fopen (filename, "w"); for (i=0, st=_state; ivisit_count); fprintf (fcvisits, "\n"); fclose (fcvisits); #ifdef HAVE_GNUPLOT time_t now; time(&now); if ( last_plot == 0 || (now-last_plot) > 2 ) { printf("Plotting...\n"); last_plot = now; plot(fileradix); } #endif } } void plot(const char *fileradix) { gnuplot_cmd(gnuplot_h, "set multiplot"); gnuplot_cmd(gnuplot_h, "set size 1,0.5"); gnuplot_cmd(gnuplot_h, "set origin 0,0"); gnuplot_cmd(gnuplot_h, "plot '%s.curr_visits.dat' w i notitle", fileradix); gnuplot_cmd(gnuplot_h, "set size 0.5,0.5"); gnuplot_cmd(gnuplot_h, "set origin 0,0.5"); gnuplot_cmd(gnuplot_h, "plot '%s.regret.dat' w l t 'regret'", fileradix); gnuplot_cmd(gnuplot_h, "set origin 0.5,0.5"); gnuplot_cmd(gnuplot_h, "plot '%s.return.dat' w l t 'return'", fileradix); gnuplot_cmd(gnuplot_h, "unset multiplot"); gnuplot_cmd(gnuplot_h, "pause 2"); }