#include #include "smaze.h" double ExplorationRate = 0.1, LearningRate = 0.2, DiscountRate = 0.9, InitialQValue = 0.5; LearningStrategy Strategy = OneStepQ; extern Agent A; extern unsigned long Step; extern double drand48(); brain_reset() { int i, j, k; A.hindex = A.hsize = 0; memset(A.QTable,0,sizeof(A.QTable)); for (j = 0; j < VGrids; j ++) for (i = 0; i < HGrids; i ++) for (k = 0; k < NActions; k ++) { A.QTable[j][i][k] = InitialQValue; A.WModel[j][i][k][0] = A.WModel[j][i][k][1] = -1; A.WModelReward[j][i][k] = 0.; } } short_random(mod) int mod; { int v; static long k = 0; if (k < mod) k = lrand48(); v = k % mod; k /= mod; return v; } static short arg_max_Q(x,y) short x, y; { short i, n, best[NActions]; double q; n = 1; best[0] = 0; q = A.QTable[y][x][0]; for (i = 1; i < NActions; i ++) { if (q < A.QTable[y][x][i]) { q = A.QTable[y][x][i]; n = 1; best[0] = i; } else if (q == A.QTable[y][x][i]) best[n ++] = i; } return (n > 1)? best[short_random(n)] : best[0]; } static double max_Q(x,y) short x, y; { int i, n; double max; n = 1; max = A.QTable[y][x][0]; for (i = 1; i < NActions; i ++) if (max < A.QTable[y][x][i]) max = A.QTable[y][x][i]; return max; } static void record_model() { A.WModel[A.y][A.x][A.a][0] = A.newx; A.WModel[A.y][A.x][A.a][1] = A.newy; A.WModelReward[A.y][A.x][A.a] = A.reward; } static void record_history() { Episode *hist = A.history + A.hindex; if (A.hsize < HistoryLength) A.hsize ++; hist->x = A.x; hist->y = A.y; hist->a = A.a; hist->reward = A.reward; } policy() { A.a = (drand48() < ExplorationRate)? short_random(NActions) : arg_max_Q(A.x,A.y); if (Strategy == BackPropQ) record_history(); } static void modify_qvalue(x,y,a,newx,newy,r) short x,y,a,newx,newy; double r; { A.QTable[y][x][a] += LearningRate * (r + DiscountRate * max_Q(newx,newy) - A.QTable[y][x][a]); } static void learnDynaQ() { int i; short x, y, a; record_model(); modify_qvalue(A.x,A.y,A.a,A.newx,A.newy,A.reward); for (i = 0; i < HistoryLength; i ++) { x = short_random(HGrids); y = short_random(VGrids); a = short_random(NActions); if (A.WModel[y][x][a][0] >= 0) modify_qvalue(x,y,a, A.WModel[y][x][a][0],A.WModel[y][x][a][1], A.WModelReward[y][x][a]); } } static void learnBackPropQ() { int n, index; short newx = A.newx, newy = A.newy; double r = A.reward; Episode *hist; n = ((A.hsize >= HistoryLength)? HistoryLength : A.hindex) - 1; for (index = A.hindex; n > 0; n --) { hist = A.history + index; modify_qvalue(hist->x,hist->y,hist->a,newx,newy,r); newx = hist->x; newy = hist->y; r = hist->reward; index = (index + HistoryLength - 1) % HistoryLength; } A.hindex = (A.hindex + 1) % HistoryLength; } learn() { switch (Strategy) { case OneStepQ: modify_qvalue(A.x,A.y,A.a,A.newx,A.newy,A.reward); break; case DynaQ: learnDynaQ(); break; case BackPropQ: learnBackPropQ(); } }