#include	<stdio.h>
#include	"smaze.h"

double	ExplorationRate = 0.1,
	LearningRate = 0.2,
	DiscountRate = 0.9,
	InitialQValue = 0.5;
LearningStrategy	Strategy = OneStepQ;

extern	Agent	A;
extern unsigned long	Step;
extern double	drand48();

brain_reset()
{
	int	i, j, k;
	A.hindex = A.hsize = 0;
	memset(A.QTable,0,sizeof(A.QTable));
	for (j = 0; j < VGrids; j ++) for (i = 0; i < HGrids; i ++)
	for (k = 0; k < NActions; k ++) {
		A.QTable[j][i][k] = InitialQValue;
		A.WModel[j][i][k][0] = A.WModel[j][i][k][1] = -1;
		A.WModelReward[j][i][k] = 0.;
	}
}
short_random(mod) int mod;
{
	int	v;
	static long k = 0;
	if (k < mod) k = lrand48();
	v = k % mod; k /= mod;
	return v;
}
static short arg_max_Q(x,y)
short	x, y;
{
	short	i, n, best[NActions];
	double	q;
	n = 1; best[0] = 0; q = A.QTable[y][x][0];
	for (i = 1; i < NActions; i ++) {
		if (q < A.QTable[y][x][i]) {
			q = A.QTable[y][x][i]; n = 1; best[0] = i;
		} else if (q == A.QTable[y][x][i]) best[n ++] = i;
	}
	return (n > 1)? best[short_random(n)] : best[0];
}
static double max_Q(x,y)
short	x, y;
{
	int	i, n;
	double	max;
	n = 1; max = A.QTable[y][x][0];
	for (i = 1; i < NActions; i ++)
		if (max < A.QTable[y][x][i]) max = A.QTable[y][x][i];
	return max;
}
static void record_model()
{
	A.WModel[A.y][A.x][A.a][0] = A.newx;
	A.WModel[A.y][A.x][A.a][1] = A.newy;
	A.WModelReward[A.y][A.x][A.a] = A.reward;
}
static void record_history()
{
	Episode		*hist = A.history + A.hindex;
	if (A.hsize < HistoryLength) A.hsize ++;
	hist->x = A.x; hist->y = A.y;
	hist->a = A.a;
	hist->reward = A.reward;
}
policy()
{
	A.a = (drand48() < ExplorationRate)?
		short_random(NActions) : arg_max_Q(A.x,A.y);
	if (Strategy == BackPropQ) record_history();
}
static void modify_qvalue(x,y,a,newx,newy,r)
short x,y,a,newx,newy;
double	r;
{
	A.QTable[y][x][a] += LearningRate *
		(r + DiscountRate * max_Q(newx,newy) - A.QTable[y][x][a]);
}
static void learnDynaQ()
{
	int	i;
	short	x, y, a;
	record_model();
	modify_qvalue(A.x,A.y,A.a,A.newx,A.newy,A.reward);
	for (i = 0; i < HistoryLength; i ++) {
		x = short_random(HGrids); y = short_random(VGrids);
		a = short_random(NActions);
		if (A.WModel[y][x][a][0] >= 0)
			modify_qvalue(x,y,a,
			A.WModel[y][x][a][0],A.WModel[y][x][a][1],
			A.WModelReward[y][x][a]);
	}
}
static void learnBackPropQ()
{
	int	n, index;
	short	newx = A.newx, newy = A.newy;
	double	r = A.reward;
	Episode		*hist;
	n = ((A.hsize >= HistoryLength)? HistoryLength : A.hindex) - 1;
	for (index = A.hindex; n > 0; n --) {
		hist = A.history + index;
		modify_qvalue(hist->x,hist->y,hist->a,newx,newy,r);
		newx = hist->x; newy = hist->y;
		r = hist->reward;
		index = (index + HistoryLength - 1) % HistoryLength;
	}
	A.hindex = (A.hindex + 1) % HistoryLength;
}
learn()
{
	switch (Strategy) {
		case OneStepQ:
		modify_qvalue(A.x,A.y,A.a,A.newx,A.newy,A.reward); break;
		case DynaQ: learnDynaQ(); break;
		case BackPropQ: learnBackPropQ();
	}
}