My neural network can't solve a maze with TensorFlow.NET and Q-learning


I am practicing neural networks with TensorFlow and Q-learning. For this project I work in C# so that I can later migrate the program to the Unity game engine. I use the TensorFlow.NET library: https://github.com/SciSharp/TensorFlow.NET

To begin with, my goal is to train a neural network to find the arrival point of a maze while avoiding the forbidden cell.

My environment is a 3*3 array of ints; the current location is given by the player's X and Y positions.

PlayerX : 0

PlayerY : 2

[  0  0  1
 0  -1  0 
 PLAYER  0  0 ]
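For reference, here is a minimal plain-C# sketch of how the 3*3 grid plus the player's position is flattened into the 9 values fed to the network; it mirrors the GetLinearState method in the source below, which marks the player's cell with 0.5 before flattening.

float[] Flatten(float[,] grid, int playerX, int playerY)
{
    // Copy the maze so the original state is not modified.
    float[,] copy = (float[,])grid.Clone();
    // Mark the player's current cell (0.5 is the marker GetLinearState uses).
    copy[playerY, playerX] = 0.5f;

    // Flatten row by row into the 9 network inputs.
    float[] inputs = new float[9];
    int write = 0;
    for (int y = 0; y < 3; y++)
        for (int x = 0; x < 3; x++)
            inputs[write++] = copy[y, x];
    return inputs;
}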

I have four possible actions: LEFT, RIGHT, UP, DOWN.

The reward I receive depends on the cell I end up in when I perform an action:

0 : free
-1 : forbidden
1 : arrived
-0.25 : already visited
-0.75 : position unchanged
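A minimal sketch of this reward scheme in plain C# (the actual logic lives in the Action() method in the source below; moved and alreadyVisited are illustrative parameters standing in for the move history that method tracks):

float Reward(float cellValue, bool moved, bool alreadyVisited)
{
    if (!moved)
        return -0.75f;                      // bumped into the grid edge, position unchanged
    if (cellValue == -1f || cellValue == 1f)
        return cellValue;                   // forbidden cell (-1) or arrival cell (+1)
    return alreadyVisited ? -0.25f : 0f;    // revisit penalty, otherwise a free cell
}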

My neural network has 9 inputs corresponding to the state of the environment and 4 outputs whose values indicate the best action to take. There is one hidden layer with 15 neurons. The activation functions are ReLU for the hidden layer and softmax for the output, and I use the Adam optimizer to train the model.
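Condensed, that setup corresponds to the NeuralNet / InitModel code further down (a sketch of the same construction, shown here only to summarize the architecture):

neuralNet = new NeuralNet(new NeuralNetArgs
{
    NeuronOfHiddenLayer = 15,                  // hidden layer size
    Activation1 = keras.activations.Relu,      // hidden layer activation
    NumClasses = 4,                            // one output per action (LEFT, RIGHT, DOWN, UP)
    Activation2 = keras.activations.Softmax,   // output activation
});
optimizer = keras.optimizers.Adam(0.001f);     // learning_rate = 0.001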

When I run the program, the agent still does not reach the arrival point during the test phase: it stays stuck against the edge of the grid and repeats the same action in a loop.

https://i.sstatic.net/yedn9.png

Training model code:

public void TrainModel(float[,] states, float[,] actions, float[,] rewards, float[,] next_states)
{
    Tensor tf_states = tf.convert_to_tensor(states, TF_DataType.TF_FLOAT);
    Tensor tf_rewards = tf.convert_to_tensor(rewards, TF_DataType.TF_FLOAT);
    Tensor tf_next_states = tf.convert_to_tensor(next_states, TF_DataType.TF_FLOAT);
    Tensor tf_actions = tf.convert_to_tensor(actions, TF_DataType.TF_FLOAT);
    List<float> losses = new List<float>();

    int size = (int)tf_next_states.shape.dims[0];

    Tensor Q_stp1 = neuralNet.Apply(tf_next_states, training: true);
    Tensor argmax = tf.cast(tf.max(Q_stp1, 1), TF_DataType.TF_FLOAT);
    Tensor argmaxExpand_dims = tf.expand_dims(argmax, 1);
    Tensor applyScalar = tf.multiply(argmaxExpand_dims, 0.99f);
    Tensor applyRewards = tf.add(applyScalar, tf_rewards);
    int count = (int)applyRewards.shape.dims[0];

    Tensor Qtargets = tf.convert_to_tensor(new NDArray(applyRewards.BufferToArray(), (count, 1), TF_DataType.TF_FLOAT));


    Func<Tensor, Tensor, Tensor, Tensor> model_loss = (tf_states, tf_actions, Qtargets) =>
    {
        Tensor result = neuralNet.Apply(tf_states, training: true);
        Tensor subtract = tf.subtract(result, Qtargets);
        Tensor square = tf.square(subtract);
        Tensor loss = tf.multiply(square, tf_actions);

        return tf.reduce_mean(loss);
    };

    Action<Tensor, Tensor, Tensor> run_optimization = (tf_states, tf_actions, Qtargets) =>
    {
        // Wrap computation inside a GradientTape for automatic differentiation.
        using var g = tf.GradientTape();
        // Forward pass.
        var loss = model_loss(tf_states, tf_actions, Qtargets);
        losses.Add(loss.BufferToArray()[0]);

        // Compute gradients.
        var gradients = g.gradient(loss, neuralNet.trainable_variables);

        // Update W and b following gradients.
        optimizer.apply_gradients(zip(gradients, neuralNet.trainable_variables.Select(x => x as ResourceVariable)));
    };

    for (int b = 0; b < size; b += batch_size)
    {
        var to = (b + batch_size < size) ? b + batch_size : b + (size - b);
        var tf_states_b = tf_states.slice(new Slice(b, to));
        var tf_actions_b = tf_actions.slice(new Slice(b, to));
        var Qtargets_b = Qtargets.slice(new Slice(b, to));

        // Minimize the error
        run_optimization(tf_states_b, tf_actions_b, Qtargets_b);
    }

    message += "Mean loss : " + Average(losses.ToArray()) + "\n\n\n";
}
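Written out in plain C# for one batch, the tensor operations above compute the following (a sketch only; qCurrent and qNext stand in for the network's outputs on states and next_states, gamma = 0.99 matches the constant used in TrainModel, and Max() needs using System.Linq):

float BatchLoss(float[][] qCurrent, float[][] qNext, float[] rewards, float[][] actionMask, float gamma = 0.99f)
{
    float sum = 0f;
    int n = rewards.Length;
    for (int i = 0; i < n; i++)
    {
        // Q-target for sample i: r + gamma * max_a Q(s', a)
        float target = rewards[i] + gamma * qNext[i].Max();
        for (int a = 0; a < 4; a++)
        {
            // (Q(s, a) - target)^2, kept only for the action actually taken (mask = 1)
            float diff = qCurrent[i][a] - target;
            sum += actionMask[i][a] * diff * diff;
        }
    }
    // tf.reduce_mean averages over every entry of the (n, 4) tensor
    return sum / (n * 4);
}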

Full source code:

MazeNetworkLearning MazeNetworkLearning = new MazeNetworkLearning();
MazeNetworkLearning.Start();

public class MazeNetworkLearning
{
    enum EActions
    {
        LEFT = 0,
        RIGHT = 1,
        DOWN = 2,
        UP = 3
    }

// World State
float[,] currentState;
int playerX = 0;
int playerY = 2;

// Model Settings.
NeuralNet neuralNet;
Tensorflow.Keras.Optimizers.OptimizerV2 optimizer;
int num_classes = 4;
int neuronOfHiddenLayer = 15;

// Training parameters.
float learning_rate = 0.001f;
int batch_size = 32;

System.Random random;
string message = "";

public void Start()
{
    random = new System.Random();

    InitState();

    InitModel();

    Train();
}

#region Training
private void InitState()
{
    currentState = new float[,]
    {
        {0, 0, 1},
        {0, -1, 0},
        {0, 0, 0}
    };
}

private void InitModel()
{
    neuralNet = new NeuralNet(new NeuralNetArgs
    {
        NumClasses = num_classes,
        NeuronOfHiddenLayer = neuronOfHiddenLayer,
        Activation1 = keras.activations.Relu,
        Activation2 = keras.activations.Softmax,
    });

    optimizer = keras.optimizers.Adam(learning_rate);

    tf.enable_eager_execution();
}

public void Train()
{
    float epsilon = 1.0f;
    List<float[]> states = new List<float[]>();
    List<float[]> rewards = new List<float[]>();
    List<float> reward_mean = new List<float>();
    List<float[]> next_states = new List<float[]>();
    List<float[]> actions = new List<float[]>();

    float[] linearstate = GetLinearState();
    float[] nextLinearState = null;

    string statesLog = "";
    statesLog += LogState(To1DArray(currentState));

    for (int epi = 0; epi < 150; epi++)
    {
        int step = 0;

        playerX = 0;
        playerY = 2;
        visited.Clear();
        visited.Add(new int[] { playerX, playerY });
        bool findplayer = false;

        while (step < 400 && findplayer == false)
        {
            EActions action = TakeAction(linearstate, epsilon);
            float reward = Action(action);
            nextLinearState = GetLinearState();

            statesLog += LogState(To1DArray(currentState));



            float[] mask = { 0, 0, 0, 0 };
            mask[(int)action] = 1;
            int index = random.Next(0, states.Count);
            statesLog += "reward : " + reward + " \n";
            //index = states.Count;
            states.Insert(index, linearstate);
            rewards.Insert(index, new float[] { reward });
            reward_mean.Insert(index, reward);
            next_states.Insert(index, nextLinearState);
            actions.Insert(index, mask);

            if (states.Count > 10000)
            {
                states.RemoveAt(0);
                rewards.RemoveAt(0);
                reward_mean.RemoveAt(0);
                next_states.RemoveAt(0);
                actions.RemoveAt(0);
            }

            linearstate = nextLinearState;
            step++;
            findplayer = playerX == 2 && playerY == 0;
        }

        epsilon = Math.Clamp(epsilon * 0.99f, 0.1f, 1.0f);

        if (epi % 5 == 0)
        {
            message += "---------------\n";
            message += "rewards mean : " + Average(reward_mean.ToArray()) + "\n";
            message += "episode : " + epi + "\n" + "\n";


            TrainModel(To2D(states.ToArray()), To2D(actions.ToArray()), To2D(rewards.ToArray()), To2D(next_states.ToArray()));
        }
    }

    string folder = @"C:\Temp\";
    string fileName = "CSharpCornerAuthors.txt";
    string fullPath = folder + fileName;


    File.WriteAllLines("WriteText.txt", new string[] { statesLog });


    Console.Write(statesLog);
    Console.Write(message);

    message = "------------------ TEST ------------------\n";
    playerX = 0;
    playerY = 2;

    int step2 = 0;
    while (step2 < 400 && !(playerX == 2 && playerY == 0))
    {
        EActions action = TakeAction(linearstate, 0);
        Action(action);
        message += LogState(To1DArray(currentState));
        step2++;
    }

    Console.Write(message);

}

public void TrainModel(float[,] states, float[,] actions, float[,] rewards, float[,] next_states)
{
    Tensor tf_states = tf.convert_to_tensor(states, TF_DataType.TF_FLOAT);
    Tensor tf_rewards = tf.convert_to_tensor(rewards, TF_DataType.TF_FLOAT);
    Tensor tf_next_states = tf.convert_to_tensor(next_states, TF_DataType.TF_FLOAT);
    Tensor tf_actions = tf.convert_to_tensor(actions, TF_DataType.TF_FLOAT);
    List<float> losses = new List<float>();

    int size = (int)tf_next_states.shape.dims[0];

    Tensor Q_stp1 = neuralNet.Apply(tf_next_states, training: true);
    Tensor argmax = tf.cast(tf.max(Q_stp1, 1), TF_DataType.TF_FLOAT);
    Tensor argmaxExpand_dims = tf.expand_dims(argmax, 1);
    Tensor applyScalar = tf.multiply(argmaxExpand_dims, 0.99f);
    Tensor applyRewards = tf.add(applyScalar, tf_rewards);
    int count = (int)applyRewards.shape.dims[0];

    Tensor Qtargets = tf.convert_to_tensor(new NDArray(applyRewards.BufferToArray(), (count, 1), TF_DataType.TF_FLOAT));


    Func<Tensor, Tensor, Tensor, Tensor> model_loss = (tf_states, tf_actions, Qtargets) =>
    {
        Tensor result = neuralNet.Apply(tf_states, training: true);
        Tensor subtract = tf.subtract(result, Qtargets);
        Tensor square = tf.square(subtract);
        Tensor loss = tf.multiply(square, tf_actions);

        return tf.reduce_mean(loss);
    };

    Action<Tensor, Tensor, Tensor> run_optimization = (tf_states, tf_actions, Qtargets) =>
    {
        // Wrap computation inside a GradientTape for automatic differentiation.
        using var g = tf.GradientTape();
        // Forward pass.
        var loss = model_loss(tf_states, tf_actions, Qtargets);
        losses.Add(loss.BufferToArray()[0]);

        // Compute gradients.
        var gradients = g.gradient(loss, neuralNet.trainable_variables);

        // Update W and b following gradients.
        optimizer.apply_gradients(zip(gradients, neuralNet.trainable_variables.Select(x => x as ResourceVariable)));
    };

    for (int b = 0; b < size; b += batch_size)
    {
        var to = (b + batch_size < size) ? b + batch_size : b + (size - b);
        var tf_states_b = tf_states.slice(new Slice(b, to));
        var tf_actions_b = tf_actions.slice(new Slice(b, to));
        var Qtargets_b = Qtargets.slice(new Slice(b, to));

        // Minimize the error
        run_optimization(tf_states_b, tf_actions_b, Qtargets_b);
    }

    message += "Mean loss : " + Average(losses.ToArray()) + "\n\n\n";
}

#endregion

#region Operation Function

public float Sum(params float[] customerssalary)
{
    float result = 0;

    for (int i = 0; i < customerssalary.Length; i++)
    {
        result += customerssalary[i];
    }

    return result;
}

public float Average(params float[] customerssalary)
{
    float sum = Sum(customerssalary);
    float result = (float)sum / customerssalary.Length;
    return result;
}

private T[] To1DArray<T>(T[,] input)
{
    // Step 1: get total size of 2D array, and allocate 1D array.
    int size = input.Length;
    T[] result = new T[size];

    // Step 2: copy 2D array elements into a 1D array.
    int write = 0;
    for (int i = 0; i <= input.GetUpperBound(0); i++)
    {
        for (int z = 0; z <= input.GetUpperBound(1); z++)
        {
            result[write++] = input[i, z];
        }
    }
    // Step 3: return the new array.
    return result;
}

private T[,] To2D<T>(T[][] source)
{
    try
    {
        int FirstDim = source.Length;
        int SecondDim = source.GroupBy(row => row.Length).Single().Key; // throws InvalidOperationException if source is not rectangular

        var result = new T[FirstDim, SecondDim];
        for (int i = 0; i < FirstDim; ++i)
            for (int j = 0; j < SecondDim; ++j)
                result[i, j] = source[i][j];

        return result;
    }
    catch (InvalidOperationException)
    {
        throw new InvalidOperationException("The given jagged array is not rectangular.");
    }
}

#endregion

#region Environment

string LogState(float[] state)
{
    string line = "";
    if (state != null && state.Length > 0)
    {
        line = "---------- STATE LENGTH : " + state.Length + " ----------" + "\n";
        line += "PlayerX : " + playerX + "\n";
        line += "PlayerY : " + playerY + "\n";

        string line2 = "";
        string newLine = "[ ";
        for (int i = 0; i < state.Length; i++)
        {

            if ((i % 3) == (playerX) && i / 3 == playerY)
                newLine += " " + "PLAYER" + " ";
            else
                newLine += " " + state[i].ToString() + " ";

            if ((i + 1) % 3 == 0)
            {
                if (i == state.Length - 1)
                    newLine += "]";

                newLine += "\n";
                line += newLine;
                newLine = "";
            }
        }
    }
    return line;

}

private float[] GetLinearState()
{

    float[,] result = (float[,])currentState.Clone();
    result[playerY, playerX] = 0.5f;

    return To1DArray(result);
}


private float GetReconpense(int x, int y)
{
    return currentState[y, x];
}

EActions TakeAction(float[] state, float epsilon)
{
    Tensor tensor = tf.constant(np.array(state));
    tensor = tf.reshape(tensor, (1, 9));
    EActions act;

    if (random.NextDouble() < epsilon)
    {
        int next = random.Next(0, 4);
        act = (EActions)next;
    }
    else
    {
        Tensor predict = neuralNet.Apply(tensor);
        act = (EActions)tf.arg_max(predict, 1).BufferToArray()[0];
        message += "Action : " + act + " \n";
    }

    return act;
}

List<int[]> visited = new List<int[]>();

float Action(EActions action)
{
    float tmpPlayerX = playerX;
    float tmpPlayerY = playerY;
    switch (action)
    {
        case EActions.LEFT:
            {
                if (playerX != 0)
                    playerX--;
            }
            break;
        case EActions.RIGHT:
            {
                if (playerX != 2)
                    playerX++;
            }
            break;
        case EActions.DOWN:
            {
                if (playerY != 2)
                    playerY++;
            }
            break;
        case EActions.UP:
            {
                if (playerY != 0)
                    playerY--;
            }
            break;
        default:
            break;
    }

    if (tmpPlayerX == playerX && tmpPlayerY == playerY)
        return -0.75f;
    else
    {
        float rec = GetReconpense(playerX, playerY);

        if (rec != -1 && rec != 1)
        {
            if (!visited.Exists(x => x[0] == playerX && x[1] == playerY))
            {
                visited.Add(new int[] { playerX, playerY });
                return 0;
            }
            else
                return -0.25f;
        }
    }
    return GetReconpense(playerX, playerY);
}

#endregion

#region Model Class

public class NeuralNet : Model
{
    Layer fc1;
    Layer output;

    public NeuralNet(NeuralNetArgs args) :
        base(args)
    {
        var layers = keras.layers;

        // First fully-connected hidden layer.
        fc1 = layers.Dense(args.NeuronOfHiddenLayer, activation: args.Activation1);

        output = layers.Dense(args.NumClasses, activation: args.Activation2);

        StackLayers(fc1, output);
    }

    // Set forward pass.
    protected override Tensors Call(Tensors inputs, Tensor state = null, bool? training = null)
    {
        inputs = fc1.Apply(inputs);
        inputs = output.Apply(inputs);
        //if (!training.Value)
        //  inputs = tf.nn.softmax(inputs);
        return inputs;
    }
}

public class NeuralNetArgs : ModelArgs
{
    /// <summary>
    /// 1st layer number of neurons.
    /// </summary>
    public int NeuronOfHiddenLayer { get; set; }
    public Activation Activation1 { get; set; }

    public int NumClasses { get; set; }
    public Activation Activation2 { get; set; }
}

#endregion
}

I was largely inspired by a Python example by Thibault Neveu with a car driving around an environment: https://github.com/thibo73800/aihub/blob/master/rl/q_learning_nn.html

I have zipped my project and the exe on my GitHub: https://github.com/Inazuma12/TensorFlowMaze

Can you help me?

