Reinforcement Learning: A Complete Guide
Discover the fundamental concepts of reinforcement learning, its algorithms, and its practical applications.
InSkillCoach
Reinforcement learning is a fascinating branch of Artificial Intelligence that enables agents to learn how to interact with their environment.
1. Fundamental Concepts
A Simple Environment
# Example of a simple environment: the agent moves along a line of states 0 to 10
class SimpleEnv:
    def __init__(self):
        self.state = 0
        self.max_steps = 10

    def reset(self):
        # Return to the initial state
        self.state = 0
        return self.state

    def step(self, action):
        # Action 0: left, action 1: right
        if action == 0:
            self.state = max(0, self.state - 1)
        else:
            self.state = min(10, self.state + 1)
        done = self.state == 10
        reward = 1 if done else 0
        return self.state, reward, done
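Before adding any learning, the environment can be sanity-checked by driving it with random actions; here is a minimal sketch (not part of the original article) that runs one episode:

# Minimal usage sketch: one episode with random actions (assumes SimpleEnv above)
import random

env = SimpleEnv()
state = env.reset()
done = False
steps = 0
while not done:
    action = random.randint(0, 1)           # pick left or right at random
    state, reward, done = env.step(action)
    steps += 1
print(f"Reached state {state} in {steps} steps, final reward: {reward}")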
Q-Learning Agent
# Example of a Q-Learning agent
import numpy as np

class QAgent:
    def __init__(self, state_size, action_size, learning_rate=0.1, gamma=0.95):
        self.q_table = np.zeros((state_size, action_size))
        self.lr = learning_rate
        self.gamma = gamma

    def choose_action(self, state, epsilon=0.1):
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit
        if np.random.random() < epsilon:
            return np.random.randint(self.q_table.shape[1])
        # Break ties randomly so an all-zero Q-table does not lock the agent into action 0
        q_values = self.q_table[state]
        return np.random.choice(np.flatnonzero(q_values == q_values.max()))

    def learn(self, state, action, reward, next_state):
        # Q-learning update: Q(s,a) <- (1 - lr) * Q(s,a) + lr * (r + gamma * max_a' Q(s',a'))
        old_value = self.q_table[state, action]
        next_max = np.max(self.q_table[next_state])
        new_value = (1 - self.lr) * old_value + self.lr * (reward + self.gamma * next_max)
        self.q_table[state, action] = new_value
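To make the update rule concrete, here is a small worked example with illustrative values (these numbers are not from the original article, they are chosen only to trace the arithmetic):

# Worked example of a single Q-learning update (illustrative values)
agent = QAgent(state_size=11, action_size=2)
agent.q_table[5, 1] = 0.4      # assume Q(5, right) is currently 0.4
agent.q_table[6].fill(0.8)     # assume max_a Q(6, a) is 0.8

agent.learn(state=5, action=1, reward=0, next_state=6)
# New value: (1 - 0.1) * 0.4 + 0.1 * (0 + 0.95 * 0.8) = 0.36 + 0.076 = 0.436
print(agent.q_table[5, 1])     # -> approximately 0.436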
2. Advanced Algorithms
Deep Q-Network (DQN)
# Example DQN implementation
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer
        self.gamma = 0.95                 # discount factor
        self.epsilon = 1.0                # initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, epsilon=None):
        # Epsilon-greedy action selection; pass epsilon=0 for a purely greedy policy
        eps = self.epsilon if epsilon is None else epsilon
        if np.random.rand() <= eps:
            return random.randrange(self.action_size)
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0)
            act_values = self.model(state)
        return torch.argmax(act_values[0]).item()

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states = torch.FloatTensor([i[0] for i in minibatch])
        actions = torch.LongTensor([i[1] for i in minibatch])
        rewards = torch.FloatTensor([i[2] for i in minibatch])
        next_states = torch.FloatTensor([i[3] for i in minibatch])
        dones = torch.FloatTensor([i[4] for i in minibatch])

        # Q(s, a) for the actions actually taken
        current_q_values = self.model(states).gather(1, actions.unsqueeze(1))
        # Bellman target: r + gamma * max_a' Q(s', a'), masked out for terminal states
        next_q_values = self.model(next_states).max(1)[0].detach()
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = F.mse_loss(current_q_values.squeeze(), target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay exploration after each training step
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
Policy Gradient
# Example Policy Gradient (REINFORCE) implementation
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

class PolicyNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.softmax(self.fc3(x), dim=-1)

class PolicyGradientAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.model = PolicyNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def choose_action(self, state):
        # Sample an action from the policy distribution and keep its log-probability
        state = torch.FloatTensor(state).unsqueeze(0)
        probs = self.model(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

    def update(self, rewards, log_probs):
        # Compute the discounted return G_t for every step of the episode
        returns = []
        R = 0
        for r in reversed(rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.FloatTensor(returns)

        # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
        policy_loss = []
        for log_prob, R in zip(log_probs, returns):
            policy_loss.append(-log_prob * R)
        policy_loss = torch.stack(policy_loss).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()
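The next section only shows a training loop for DQN on CartPole; for completeness, here is a minimal REINFORCE training sketch for the agent above. It is illustrative only and assumes the classic Gym API (reset() returning the state and step() returning four values), like the rest of this article.

# Illustrative REINFORCE training loop on CartPole (assumes classic Gym API)
import gym

def train_policy_gradient(episodes=500):
    env = gym.make('CartPole-v1')
    agent = PolicyGradientAgent(env.observation_space.shape[0], env.action_space.n)
    for episode in range(episodes):
        state = env.reset()
        rewards, log_probs = [], []
        done = False
        while not done:
            action, log_prob = agent.choose_action(state)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            log_probs.append(log_prob)
        # One policy update per completed episode
        agent.update(rewards, log_probs)
        if episode % 50 == 0:
            print(f"Episode {episode}, Total Reward: {sum(rewards)}")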
3. Practical Applications
A Simple Game
# Example application: training the Q-learning agent on the simple game
def train_simple_game():
    env = SimpleEnv()
    agent = QAgent(11, 2)  # 11 states (0-10), 2 actions
    episodes = 1000
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        while True:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.learn(state, action, reward, next_state)
            total_reward += reward
            state = next_state
            if done:
                break
        print(f"Episode {episode}, Total Reward: {total_reward}")
CartPole with DQN
# Example application: DQN on CartPole
# Note: this uses the classic Gym API; newer gym/gymnasium versions return
# (obs, info) from reset() and five values from step().
import gym

def train_cartpole():
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    episodes = 1000
    batch_size = 32
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        while True:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
            if done:
                print(f"Episode {episode}, Total Reward: {total_reward}")
                break
4. Optimization Techniques
Experience Replay
# Example Experience Replay buffer
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
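Here is a minimal usage sketch of the buffer, reusing the SimpleEnv class from section 1 with random actions (illustrative only):

# Illustrative usage of ReplayBuffer with the SimpleEnv from section 1
import random

buffer = ReplayBuffer(capacity=1000)
env = SimpleEnv()
state = env.reset()
done = False
while not done:
    action = random.randint(0, 1)
    next_state, reward, done = env.step(action)
    buffer.push(state, action, reward, next_state, done)
    state = next_state

# Reaching state 10 takes at least 10 steps, so a batch of 8 is always available here
batch = buffer.sample(8)   # list of 8 (state, action, reward, next_state, done) tuples
print(len(batch), batch[0])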
Target Network
# Example implementation with a Target Network
class DQNAgentWithTarget:
    def __init__(self, state_size, action_size):
        self.policy_net = DQN(state_size, action_size)   # network being trained
        self.target_net = DQN(state_size, action_size)   # frozen copy used for the Bellman targets
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def update_target_network(self):
        # Periodically copy the policy network weights into the target network
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def get_target_q_values(self, next_states):
        with torch.no_grad():
            return self.target_net(next_states).max(1)[0]
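In a complete training loop, the target network is refreshed only every few hundred steps. The sketch below illustrates that schedule; the SYNC_EVERY interval and the 4-dimensional state are arbitrary choices for illustration, not recommendations from this article.

# Illustrative target-network update schedule (sync interval chosen arbitrarily)
SYNC_EVERY = 500                      # environment steps between target updates

agent = DQNAgentWithTarget(state_size=4, action_size=2)
for step in range(10_000):
    # ... collect a transition and train policy_net against
    # rewards + gamma * agent.get_target_q_values(next_states) here ...
    if step % SYNC_EVERY == 0:
        agent.update_target_network()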
5. Evaluation and Monitoring
Performance Metrics
# Example performance evaluation
import numpy as np

def evaluate_agent(env, agent, episodes=10):
    rewards = []
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        while True:
            action = agent.act(state, epsilon=0)  # no exploration: purely greedy policy
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
            if done:
                break
        rewards.append(episode_reward)
    return {
        'mean_reward': np.mean(rewards),
        'std_reward': np.std(rewards),
        'min_reward': np.min(rewards),
        'max_reward': np.max(rewards)
    }
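For example, after the DQN training from section 3, the evaluation could be run like this (a sketch, assuming env and agent from that training are still available in scope):

# Illustrative call, e.g. with the CartPole env and trained DQN agent in scope
stats = evaluate_agent(env, agent, episodes=10)
print(f"Mean reward: {stats['mean_reward']:.1f} ± {stats['std_reward']:.1f} "
      f"(min {stats['min_reward']}, max {stats['max_reward']})")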
Visualizing the Results
# Example of plotting training results
import matplotlib.pyplot as plt

def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Reward per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.show()
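Per-episode rewards are noisy; a common complement, sketched below (not part of the original code), is to overlay a moving average so the trend is easier to read:

# Optional: overlay a moving average to smooth the noisy per-episode curve
import numpy as np
import matplotlib.pyplot as plt

def plot_smoothed_results(rewards_history, window=50):
    rewards = np.asarray(rewards_history, dtype=float)
    plt.figure(figsize=(10, 5))
    plt.plot(rewards, alpha=0.3, label='Per-episode reward')
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
        plt.plot(np.arange(window - 1, len(rewards)), smoothed,
                 label=f'{window}-episode average')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.legend()
    plt.show()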
Conclusion
Reinforcement learning opens up immense possibilities:
- Video games
- Robotics
- Process optimization
- Automated trading
- Control systems
Key points to remember:
- Understand the fundamental concepts
- Master the different algorithms
- Optimize performance
- Evaluate the results
- Follow best practices