%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% FILE: hybrid_least_squares_example.m %%% %%% %%% %%% This is a simple two-room MDP demo showing how the hybrid %%% %%% least-squares algorithm behaves when used in policy iteration. %%% %%% The hybrid least-squares algorithm effectively combines the %%% %%% objective functions of: %%% %%% (1) the fixed-point method (i.e. least-squares TD) %%% %%% (2) Bellman residual minimization method. %%% %%% The hybrid algorithms use a parameter 0 <= beta <= 1. %%% %%% When beta=0, the method defaults to the fixed-point method. %%% %%% When beta=1, the method defaults to the Bellman residual %%% %%% minimization method. %%% %%% %%% %%% Linear least-squares methods solve the problem A*w = b. %%% %%% With each sample from a MDP, we %%% %%% incrementally build up an estimate of matrix A and vector b. %%% %%% When only a "single-sample" or "single-transition" is available %%% %%% from each state, it is well-known that the Bellman residual %%% %%% method produces a biased estimate of the matrix A and that the %%% %%% fixed-point method produces an unbiased estimate of the matrix A. %%% %%% This demo shows one set of plots using a biased estimator of %%% %%% matrix A and another set of plots using an unbiased estimator. %%% %%% When beta=0, these two sets of plots are identical because the %%% %%% fixed-point method is unbiased even with "single-samples." %%% %%% %%% %%% For further details, please see: %%% %%% %%% %%% Hybrid Least-Squares Algorithms for Approximate Policy Evaluation %%% %%% Jeff Johns, Marek Petrik, and Sridhar Mahadevan %%% %%% Machine Learning 76 (2-3), 243-256, 2009. %%% %%% %%% %%% Jeff Johns, UMass Amherst 2009 %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Define a two-room MDP where each %%% %%% room is of size "nx" by "ny" %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% nx = 11; ny = 11; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Generate a matrix "O" which %%% %%% labels the state space %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% MX = nx+2; MY = 2*ny+3; MYmid = ny+2; MXmid = (nx+3)/2; %%% Occupancy matrix (1=state, 0=wall) O = ones(MY,MX); O(:,1) = 0; O(:,end) = 0; O(1,:) = 0; O(end,:) = 0; O(MYmid,:) = 0; O(MYmid,MXmid) = 1; %%% this is the doorway state between the two rooms %%% Give each state its own number n = 0; for i = 1:MY for j = 1:MX if (O(i,j)==1) n = n + 1; O(i,j) = n; end end end %%% Create an index into matrix O for easy plotting later idxO = find(O); [ignore, ix] = sort(O(idxO)); idxO = idxO(ix); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Define the probability of transitioning between the states %%% %%% using the four canonical actions (north, east, south, west). %%% %%% An action succeeds with probability "pSucc"; failed actions %%% %%% result in a transition using one of the other 3 actions. %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% pSucc = .8; P = zeros(n, n, 4); for i = 1:MY for j = 1:MX if (O(i,j)>0) state = O(i,j); if (O(i-1,j)>0) nextstate_north = O(i-1,j); else nextstate_north = state; end if (O(i+1,j)>0) nextstate_south = O(i+1,j); else nextstate_south = state; end if (O(i,j+1)>0) nextstate_east = O(i,j+1); else nextstate_east = state; end if (O(i,j-1)>0) nextstate_west = O(i,j-1); else nextstate_west = state; end %%% Action 1 (North) P(state, nextstate_north, 1) = pSucc; P(state, nextstate_east, 1) = P(state, nextstate_east, 1) + (1-pSucc)/3; P(state, nextstate_south, 1) = P(state, nextstate_south, 1) + (1-pSucc)/3; P(state, nextstate_west, 1) = P(state, nextstate_west, 1) + (1-pSucc)/3; %%% Action 2 (East) P(state, nextstate_east, 2) = pSucc; P(state, nextstate_north, 2) = P(state, nextstate_north, 2) + (1-pSucc)/3; P(state, nextstate_south, 2) = P(state, nextstate_south, 2) + (1-pSucc)/3; P(state, nextstate_west, 2) = P(state, nextstate_west, 2) + (1-pSucc)/3; %%% Action 3 (South) P(state, nextstate_south, 3) = pSucc; P(state, nextstate_north, 3) = P(state, nextstate_north, 3) + (1-pSucc)/3; P(state, nextstate_east, 3) = P(state, nextstate_east, 3) + (1-pSucc)/3; P(state, nextstate_west, 3) = P(state, nextstate_west, 3) + (1-pSucc)/3; %%% Action 4 (West) P(state, nextstate_west, 4) = pSucc; P(state, nextstate_north, 4) = P(state, nextstate_north, 4) + (1-pSucc)/3; P(state, nextstate_east, 4) = P(state, nextstate_east, 4) + (1-pSucc)/3; P(state, nextstate_south, 4) = P(state, nextstate_south, 4) + (1-pSucc)/3; end end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Define the reward function and discount factor. %%% %%% Just use a simple reward of 0 everywhere except %%% %%% +1 in one corner state. %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% R = zeros(n, 1); R(1) = 1; discount = .95; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Create some basis functions for the MDP. You can plug in anything %%% %%% here, but I'll just use proto-value functions as defined in: %%% %%% %%% %%% Proto-value Functions: A Laplacian Framework for Learning %%% %%% Representation and Control in Markov Decision Processes %%% %%% Sridhar Mahadevan and Mauro Maggioni %%% %%% Journal of Machine Learning Research 8, 2169-2231, 2007. %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% W = zeros(n, n); for action = 1:4 W = W + P(:, :, action); end W = double((W+W')>0); L = diag(sum(W')) - W; %%% combinatorial graph Laplacian [eigvec, eigval] = eig(L); [eigval, idx] = sort(diag(eigval)); eigvec = eigvec(:, idx); %%% How many eigenvectors to use for the approximation? nEigs = 25; %%% Since the basis functions need to be defined over state-action space, %%% just use the same set of eigenvectors for each of the four actions %%% NOTE: This ordering of matrix Phi means the Q-function is a %%% vector of length 4*n where: %%% Q(1) = Q-value for state 1, action 1 %%% Q(n) = Q-value for state n, action 1 %%% Q(n+1) = Q-value for state 1, action 2 %%% ... %%% Q(4*n) = Q-value for state n, action 4 Phi = zeros(4*n, 4*nEigs); for action = 1:4 Phi( ((1:n)+(n*(action-1))), ((1:nEigs)+(nEigs*(action-1))) ) = eigvec(:, 1:nEigs); end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Hybrid Least-Squares Parameter %%% %%% Set: 0<= beta <= 1 %%% %%% beta=0 : fixed-point method %%% %%% beta=1 : Bellman residual minimization method %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% beta = 0.5; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Define the number of rounds of policy iteration %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% n_policy_iteration_rounds = 30; initial_policy = ceil(4*rand(n,1)); %%% random initial policy %%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Policy Iteration Loop %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%% policy_biased_A = initial_policy; policy_unbiased_A = initial_policy; fig=figure(22); set(fig,'units','normalized','outerposition',[0 0 1 1]); for iter = 1:n_policy_iteration_rounds %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Policy Evaluation using both the %%% %%% Biased and Unbiased Estimators of Matrix A %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% biased = [true false]; for b_i = 1:length(biased) %%% Select the policy to evaluate if biased(b_i) policy = policy_biased_A; else policy = policy_unbiased_A; end %%% Save the policy now save_policy = policy; %%% Compute the transition matrix for this policy P_policy = zeros(4*n, 4*n); for i = 1:n for action = 1:4 state_action = i + (action-1)*n; for j = 1:n if (P(i,j,action)>0) nextstate_action = j + (policy(j)-1)*n; P_policy(state_action, nextstate_action) = P_policy(state_action, nextstate_action) + P(i,j,action); end end end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% The least-squares problem is A*w = b. The matrix A and vector b are as follows: %%% %%% %%% %%% A = (Phi - beta * discount * P_policy * Phi)' * (Phi - discount * P_policy * Phi) %%% %%% b = (Phi - beta * discount * P_policy * Phi)' * R %%% %%% %%% %%% The unbiased estimate of A is simply A (since we are not sampling in this demo). %%% %%% The biased estimate of A comes from the inability to represent the following term %%% %%% when using just "single-samples" from the MDP: %%% %%% %%% %%% beta * discount * discount * Phi' * (P_policy' * P_policy) * Phi %%% %%% %%% %%% When "single-samples" are used to estimate this quantity, you instead get: %%% %%% %%% %%% beta * discount * discount * Phi' * (diag(P_policy' * ones)) * Phi %%% %%% %%% %%% where "ones" is a vector of 1's of length 4*n. %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% b = (Phi - beta * discount * P_policy * Phi)' * [R; R; R; R]; A = Phi'*Phi - beta * discount * (P_policy * Phi)' * Phi - discount * Phi' * (P_policy * Phi); if biased(b_i) A = A + beta * discount * discount * Phi' * (diag(P_policy' * ones(4*n,1))) * Phi; else A = A + beta * discount * discount * Phi' * (P_policy' * P_policy) * Phi; end %%% Solve the least-squares problem w = A\b; Q_policy = Phi * w; %%% approximate Q-value for this policy %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Policy Improvement by Greedification %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Q_policy = reshape(Q_policy, n, 4); Q_max = zeros(n, 1); for i = 1:n [Q_max(i), policy(i)] = max(Q_policy(i,:)); end %%% Save the New Policy if biased(b_i) policy_biased_A = policy; plotrow=0; else policy_unbiased_A = policy; plotrow=3; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%% Make the Following Three Plots: %%% %%% (1) the policy being evaluated %%% %%% (2) the maximum (over 4 actions) of Q_policy %%% %%% (3) the greedy policy associated with Q_policy %%% %%% %%% %%% The color scheme on the policies is: %%% %%% North=blue, East=green, South=orange, West=red %%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Oplot = zeros(size(O)); Oplot(idxO) = save_policy; subplot(2,3,1+plotrow); imagesc(Oplot); title(['Pol. being Eval., Round ' num2str(iter)]); Oplot = zeros(size(O)); Oplot(idxO) = policy; subplot(2,3,3+plotrow); imagesc(Oplot); title('Greedy Policy'); Oplot = zeros(size(O)); Oplot(idxO) = Q_max; subplot(2,3,2+plotrow); surf(Oplot); set(gca, 'YDir', 'reverse'); if biased(b_i) title(['max(Q), beta=' num2str(beta) ', Biased A']); else title(['max(Q), beta=' num2str(beta) ', Unbiased A']); end end drawnow; end