#include "stdint.h" typedef int state_t; /* 0 <= .. < state_count */ typedef int action_t; /* 0 <= .. < succ_count */ typedef double reward_t; /* stochastic : independant normal distribution on each state : 0 <= mean < 1, sigma_min <= std dev < sigma_max */ /** * Builds a MDP with state_count states. * * Each state has succ_count successors. Some of them may happen to be equal. * Transitions are deterministic: action i consists in jumping to i-th * (known) successor. * * Successors are randomly, approximately (see below) uniformly distributed. * There is no spatial regularity of any kind. * * The MDP graph is strongly connex : there is a path between any two states. * By ensuring this property, the construction of the graph may harm the * uniformity of successors distribution, especially with only two successors * per state: the graph can then be viewed as a directed circle plus (uniform) * random edges. * * The reward for a transition depends only on the reached state. It is * stochastic, following a Normal distribution. For each state, the * mean and standard deviation of rewards are chosen randomly, * uniformly, over [0,1) and [sigma_min, sigma_max). * sigma_min and sigma_max are set randomly using the following formulae: * sigma_min = 0.5 * exp(-1 / X^2), sigma_max = 0.5 * exp(-0.5 / Y), * where X and Y are uniform random over [0,1). * If sigma_min happens to be superior to sigma_max, they swap. * * Each state is referred to by an index >=0 (type state_t = int). * * The MDP cannot be set to an arbitrary state: one can only use regular * transitions from the current state. The initial state is the one * with index 0 (which is the value returned by the constructor function). * No reward is associated to this initial state at time 0. * * The MDP depends only on the given random seed, regardless of the machine, * system, time, or whatever. * Each state has its own random number generator, thus the sequence of * rewards in that state will always be the same, regardless of the order * of visits or, again, the machine or system. * * Five statistics files are written during the MDP walk, in the current * directory, all named after the parameters of this constructor. * The time interval between each writing of fresh statistics is the number * of states. * - the first one, suffixed by '.return.dat', records a soft moving average * of backward return (sum of backward-discounted rewards), against time. * R(g,t) = r(t) + g * r(t-1) + g^2 * r(t-2) + ... * g is defined such that the horizon 1 + g + g^2 + ... = 1/(1+g) is equal * to the square root of the number of states : g = 1 - 1/sqrt(state_count). * The soft average of this quantity is computed by using a larger horizon * equal to the number of states : * g2 = 1 - 1/state_count, * and normalizing this long term return by * R_avg(g,t) = R(g2,t) * (1-g2)/(1-g) * - the second one, suffixed by '.visits.dat', records how many times each * state has been visited, against time. * - the third one, suffixed by '.curr_visits.dat', shows the same thing, * but without keeping the history. It may be useful for monitoring via * a plot software. * - the fourth one, suffixed by '.mdp.dat', reveals for each state its * successors, the mean & standard deviation of rewards, and its value under * optimal policy (with respect to the discount factor defined above). * It also shows the (probably) optimal loop : the sequence of states an * agent ends looping in, when starting in state 0 and following the optimal * policy. 
 *   Note that since a discount factor is used, there is a (very tiny)
 *   chance that some other loop may have a higher value.
 * - the average reward in this loop is, however, considered as the optimal
 *   average reward r*, and the cumulative regret achieved by the agent at
 *   time T is defined by
 *     T * r* - sum_{t in 0..T} mu(t),
 *   where mu(t) is the mean reward of the state visited at time t.
 *   The last statistics file records this regret against time.
 */
state_t make_MDP (uint32_t seed, int state_count, int succ_count);

/**
 * Returns the id of the current state's u-th successor.
 */
state_t successor (action_t u);

/**
 * Sets *s to the new current state, i.e. the current state's u-th successor,
 * and *r to a stochastic reward drawn from that new state.
 */
void act (action_t u, state_t *s, reward_t *r);

/**
 * Deletes the MDP and closes the statistics files.
 */
void close_MDP (void);
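
/*
 * The sketch below is not part of the interface above: it only illustrates,
 * under the formulas documented for the '.return.dat' file, how the soft
 * moving average of the backward return can be maintained incrementally,
 * using R(g,t) = r(t) + g * R(g,t-1). The function name and the caller-held
 * accumulator are illustrative choices, not names used by the library.
 */
static inline double soft_return_update (double *R_g2,  /* holds R(g2,t-1), updated in place */
                                         reward_t r,    /* reward observed at time t         */
                                         double g,      /* g  = 1 - 1/sqrt(state_count)      */
                                         double g2)     /* g2 = 1 - 1/state_count            */
{
    /* R(g2,t) = r(t) + g2 * R(g2,t-1) */
    *R_g2 = (double) r + g2 * (*R_g2);
    /* R_avg(g,t) = R(g2,t) * (1-g2)/(1-g):
       rescale the long-horizon return to the short horizon 1/(1-g). */
    return (*R_g2) * (1.0 - g2) / (1.0 - g);
}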
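
/*
 * A minimal usage sketch of the interface above. The uniformly random
 * policy, the parameter values and the MDP_USAGE_EXAMPLE guard are
 * illustrative assumptions, not part of the library; compile a translation
 * unit with -DMDP_USAGE_EXAMPLE (and link the MDP implementation) to try it.
 */
#ifdef MDP_USAGE_EXAMPLE
#include <stdio.h>
#include <stdlib.h>

int main (void)
{
    int state_count = 1000, succ_count = 4;

    /* The constructor returns the initial state, which is state 0. */
    state_t s = make_MDP (12345u, state_count, succ_count);
    reward_t r;
    double total = 0.0;

    /* Walk the MDP; the statistics files described above are written
       automatically every state_count steps. */
    for (long t = 1; t <= 100L * (long) state_count; t++) {
        action_t u = (action_t) (rand () % succ_count);
        state_t peek = successor (u);   /* id of the u-th successor of s    */
        act (u, &s, &r);                /* jump to it and collect a reward  */
        total += r;
        if (s != peek)
            fprintf (stderr, "unexpected transition at t=%ld\n", t);
    }

    printf ("average reward per step: %f\n", total / (100.0 * state_count));
    close_MDP ();   /* flush and close the statistics files */
    return 0;
}
#endif /* MDP_USAGE_EXAMPLE */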