Untitled

From Putrid Parakeet, 2 Years ago, written in Plain Text.

Embed

Download Paste or View Raw
Hits: 107

import gym

import numpy as np

# Tabular Q-learning on CartPole-v1 with an epsilon-greedy policy.
#
# CartPole observations are four continuous values (cart position, cart
# velocity, pole angle, pole angular velocity), so they cannot index a table
# directly. Each component is clipped to a finite range and split into a fixed
# number of bins; the Q-table is indexed by the resulting bin tuple.

# Create the CartPole environment.
env = gym.make('CartPole-v1')

# Clipping range per observation component. Position and angle use the
# environment's termination thresholds (|x| > 2.4, |theta| > 12 degrees, about
# 0.21 rad). The velocities are unbounded, so their ranges are heuristic.
state_low = np.array([-2.4, -3.0, -0.21, -3.5])
state_high = np.array([2.4, 3.0, 0.21, 3.5])

# Number of bins per observation component.
num_bins = np.array([6, 6, 12, 12])


def discretize(observation):
    """Map a continuous observation to a tuple of Q-table bin indices.

    Values outside ``[state_low, state_high]`` are clipped into the first or
    last bin, so the returned indices are always valid for ``Q``.
    """
    clipped = np.clip(np.asarray(observation, dtype=float), state_low, state_high)
    fractions = (clipped - state_low) / (state_high - state_low)
    # A fraction of exactly 1.0 would land one past the last bin; clamp it.
    indices = np.minimum((fractions * num_bins).astype(int), num_bins - 1)
    return tuple(int(i) for i in indices)


# Q-table initialised to zero: one axis per discretized observation component
# plus a final axis for the actions.
Q = np.zeros(tuple(num_bins) + (env.action_space.n,))

# Hyperparameters.
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Initial exploration probability
epsilon_min = 0.01  # Minimum exploration probability
epsilon_decay = 0.99  # Multiplicative decay applied to epsilon after each episode

# Number of episodes and the per-episode step cap.
num_episodes = 1000
max_steps_per_episode = 500

# Training loop.
for i_episode in range(num_episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = discretize(observation)
    total_reward = 0

    for t in range(max_steps_per_episode):
        # Epsilon-greedy action selection in the current state.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore: random action
        else:
            action = int(np.argmax(Q[state]))  # Exploit: best known action

        # Take the action; observe the reward and the next state.
        # gym >= 0.26 returns five values (terminated and truncated are
        # separate); older releases return four with a single done flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_observation, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_observation, reward, done, info = step_result
            # Older TimeLimit wrappers mark a time-limit cut-off in info.
            terminated = done and not info.get('TimeLimit.truncated', False)
        next_state = discretize(next_observation)

        # Q-learning update for (state, action). A terminal next state has no
        # future return, so the bootstrap term is dropped there.
        best_next_value = 0.0 if terminated else np.max(Q[next_state])
        td_target = reward + gamma * best_next_value
        Q[state + (action,)] += alpha * (td_target - Q[state + (action,)])

        # Advance to the next state.
        state = next_state
        total_reward += reward

        if done:
            break

    # Decay the exploration probability so the policy becomes greedier over time.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Report the result of this episode.
    print("Episode {}: Total reward = {}, Epsilon = {:.2f}".format(i_episode, total_reward, epsilon))

# Release the environment's resources once training is finished.
env.close()

Author

Title

Language

Your paste - Paste your paste here

import gym
import numpy as np

# Tabular Q-learning on CartPole-v1 with an epsilon-greedy policy.
#
# CartPole observations are four continuous values (cart position, cart
# velocity, pole angle, pole angular velocity), so they cannot index a table
# directly. Each component is clipped to a finite range and split into a fixed
# number of bins; the Q-table is indexed by the resulting bin tuple.

# Create the CartPole environment.
env = gym.make('CartPole-v1')

# Clipping range per observation component. Position and angle use the
# environment's termination thresholds (|x| > 2.4, |theta| > 12 degrees, about
# 0.21 rad). The velocities are unbounded, so their ranges are heuristic.
state_low = np.array([-2.4, -3.0, -0.21, -3.5])
state_high = np.array([2.4, 3.0, 0.21, 3.5])

# Number of bins per observation component.
num_bins = np.array([6, 6, 12, 12])


def discretize(observation):
    """Map a continuous observation to a tuple of Q-table bin indices.

    Values outside ``[state_low, state_high]`` are clipped into the first or
    last bin, so the returned indices are always valid for ``Q``.
    """
    clipped = np.clip(np.asarray(observation, dtype=float), state_low, state_high)
    fractions = (clipped - state_low) / (state_high - state_low)
    # A fraction of exactly 1.0 would land one past the last bin; clamp it.
    indices = np.minimum((fractions * num_bins).astype(int), num_bins - 1)
    return tuple(int(i) for i in indices)


# Q-table initialised to zero: one axis per discretized observation component
# plus a final axis for the actions.
Q = np.zeros(tuple(num_bins) + (env.action_space.n,))

# Hyperparameters.
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0  # Initial exploration probability
epsilon_min = 0.01  # Minimum exploration probability
epsilon_decay = 0.99  # Multiplicative decay applied to epsilon after each episode

# Number of episodes and the per-episode step cap.
num_episodes = 1000
max_steps_per_episode = 500

# Training loop.
for i_episode in range(num_episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone.
    reset_result = env.reset()
    observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = discretize(observation)
    total_reward = 0

    for t in range(max_steps_per_episode):
        # Epsilon-greedy action selection in the current state.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore: random action
        else:
            action = int(np.argmax(Q[state]))  # Exploit: best known action

        # Take the action; observe the reward and the next state.
        # gym >= 0.26 returns five values (terminated and truncated are
        # separate); older releases return four with a single done flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_observation, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_observation, reward, done, info = step_result
            # Older TimeLimit wrappers mark a time-limit cut-off in info.
            terminated = done and not info.get('TimeLimit.truncated', False)
        next_state = discretize(next_observation)

        # Q-learning update for (state, action). A terminal next state has no
        # future return, so the bootstrap term is dropped there.
        best_next_value = 0.0 if terminated else np.max(Q[next_state])
        td_target = reward + gamma * best_next_value
        Q[state + (action,)] += alpha * (td_target - Q[state + (action,)])

        # Advance to the next state.
        state = next_state
        total_reward += reward

        if done:
            break

    # Decay the exploration probability so the policy becomes greedier over time.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Report the result of this episode.
    print("Episode {}: Total reward = {}, Epsilon = {:.2f}".format(i_episode, total_reward, epsilon))

# Release the environment's resources once training is finished.
env.close()

Private - Private paste aren't shown in recent listings.

Delete After - When should we delete your paste?

Spam protection -

{"html5":"htmlmixed","css":"css","javascript":"javascript","php":"php","python":"python","ruby":"ruby","lua":"text\/x-lua","bash":"text\/x-sh","go":"go","c":"text\/x-csrc","cpp":"text\/x-c++src","diff":"diff","latex":"stex","sql":"sql","xml":"xml","apl":"apl","asterisk":"asterisk","c_loadrunner":"text\/x-csrc","c_mac":"text\/x-csrc","coffeescript":"text\/x-coffeescript","csharp":"text\/x-csharp","d":"d","ecmascript":"javascript","erlang":"erlang","groovy":"text\/x-groovy","haskell":"text\/x-haskell","haxe":"text\/x-haxe","html4strict":"htmlmixed","java":"text\/x-java","java5":"text\/x-java","jquery":"javascript","mirc":"mirc","mysql":"sql","ocaml":"text\/x-ocaml","pascal":"text\/x-pascal","perl":"perl","perl6":"perl","plsql":"sql","properties":"text\/x-properties","q":"text\/x-q","scala":"scala","scheme":"text\/x-scheme","tcl":"text\/x-tcl","vb":"text\/x-vb","verilog":"text\/x-verilog","yaml":"text\/x-yaml","z80":"text\/x-z80"}

Reply to "Untitled"