from Numeric import array, transpose, power, matrixmultiply

(ACTION1, ACTION2, ACTION3, ACTION_COUNT) = range(4)
(STATE_A, STATE_B, STATE_C, STATE_COUNT) = range(4)

# Per-action transition matrices: actions[a][s, s'] = Pr[x_t+1 = s' | x_t = s, a_t = a]
actions = [None] * ACTION_COUNT
actions[ACTION1] = array([[0.5,    0.25,   0.25],
                          [0.125,  3/4.0,  0.125],
                          [0.25,   0.125,  5/8.0]])
actions[ACTION2] = array([[1/2.0,  0,      1/2.0],
                          [0,      0,      0],        # action 2 is not available in state B
                          [1/16.0, 7/8.0,  1/16.0]])
actions[ACTION3] = array([[1/4.0,  1/4.0,  1/2.0],
                          [1/8.0,  3/4.0,  1/8.0],
                          [3/4.0,  1/16.0, 3/16.0]])

# Marginal state-transition matrix under a uniform choice among the actions
# available in each state; each row sums to 1.
transition = array([[ 5.0/12,  2.0/12,  5.0/12],
                    [ 1.0/8,   6.0/8,   1.0/8],
                    [17.0/48, 17.0/48, 14.0/48]])

# Per-action reward matrices: rewards[a][s, s'] = reward for the transition s -> s' under action a
rewards = [None] * ACTION_COUNT
rewards[ACTION1] = array([[10,  4,  8],
                          [14,  0, 18],
                          [10,  2,  8]])
rewards[ACTION2] = array([[ 8,  2,  4],
                          [ 0,  0,  0],
                          [ 6,  4,  2]])
rewards[ACTION3] = array([[ 4,  6,  4],
                          [ 8, 16,  8],
                          [ 4,  0,  8]])

# Pr[a_t | x_t = A] -> Pi(x_t, a_t)
# Expected one-step reward for taking `action` in `state`: the dot product
# of that state's transition-probability row with its reward row.
def reward(state, action):
    return matrixmultiply(actions[action][state, :], rewards[action][state, :])

# Average the expected reward over the three states (uniform weight 1/3)
# for each action, then total over the actions.
r = 0
for a in range(ACTION_COUNT):
    state_r = 0
    for s in range(STATE_COUNT):
        state_r += reward(s, a)
    state_r = state_r / 3.0
    r += state_r

print r
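
# A cross-check of the same expected-reward computation with NumPy.
# This is a minimal sketch, assuming NumPy (the successor of the old Numeric
# package) is available; the names P, R, expected and total are introduced
# here for illustration and mirror `actions`, `rewards` and `r` above.

import numpy as np

# Per-action transition matrices P[a][s, s'] and reward matrices R[a][s, s'],
# with the same values as the Numeric script above (fractions written as decimals).
P = [
    np.array([[0.5,    0.25,   0.25  ],
              [0.125,  0.75,   0.125 ],
              [0.25,   0.125,  0.625 ]]),
    np.array([[0.5,    0.0,    0.5   ],
              [0.0,    0.0,    0.0   ],   # action 2 unavailable in state B
              [0.0625, 0.875,  0.0625]]),
    np.array([[0.25,   0.25,   0.5   ],
              [0.125,  0.75,   0.125 ],
              [0.75,   0.0625, 0.1875]]),
]
R = [
    np.array([[10,  4,  8], [14,  0, 18], [10, 2, 8]]),
    np.array([[ 8,  2,  4], [ 0,  0,  0], [ 6, 4, 2]]),
    np.array([[ 4,  6,  4], [ 8, 16,  8], [ 4, 0, 8]]),
]

# expected[s, a] = sum over s' of P[a][s, s'] * R[a][s, s'],
# i.e. the expected one-step reward for taking action a in state s.
expected = np.array([[np.dot(P[a][s], R[a][s]) for a in range(3)]
                     for s in range(3)])

# Same aggregation as above: average over states with uniform weight 1/3,
# then sum over the three actions.
total = expected.mean(axis=0).sum()
print(total)

# Keeping the per-action matrices in plain Python lists mirrors the original;
# stacking them into a single 3-D array and using np.einsum would work as well.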