import numpy as np
def edw(N, alpha):
    """Return ``N`` normalized exponential-decay weights, smallest exponent last.

    The unnormalized weight at position ``i`` (before reversal) is
    ``(1 - alpha) ** i``. The weights are divided by their sum and then
    reversed, so the final element of the result corresponds to exponent 0.
    For ``0 < alpha < 1`` this means the last entry carries the largest weight.

    Args:
        N (int): Number of weights to generate.
        alpha (float): Decay rate; each step multiplies the weight by
            ``1 - alpha``.

    Returns:
        numpy.ndarray: 1-D float array of length ``N`` that sums to 1
        (an empty array when ``N`` is 0).
    """
    # Use a float base so an integer ``alpha`` (e.g. 0 or 1) cannot produce an
    # integer array, for which true division in place would raise.
    base = 1.0 - alpha
    weights = base ** np.arange(N)
    # Normalize so the weights sum to 1, then flip the order.
    normalized = weights / weights.sum()
    return normalized[::-1]
class RewardScaler:
    def __init__(self, clip=5, eps=1e-3, alpha=0.04, warmup=3):
        """
        Reward scaling with exponentially weighted running statistics.

        Scales rewards using a mean and standard deviation computed over the
        full recorded reward history with exponential decay weights, then
        clips the result. This helps stabilize training by normalizing the
        reward distribution.

        Args:
            clip (float, optional): Maximum absolute value for scaled rewards. Default: 5
            eps (float, optional): Small constant added to the standard
                deviation for numerical stability. Default: 1e-3
            alpha (float, optional): Decay rate for the exponential weights. Default: 0.04
            warmup (int, optional): Computed statistics are used only once
                more than ``warmup`` rewards have been recorded; until then
                the mean is 0 and the standard deviation is 1. Default: 3

        Example:
            >>> scaler = RewardScaler(clip=5, alpha=0.1)
            >>> for i in range(100):
            ...     reward = get_reward()
            ...     scaler.update(reward)
            ...     scaled_reward = scaler.scale(reward)
        """
        # Every recorded reward is kept; statistics are recomputed from this
        # list on each call to ``get_params``.
        self.rewards = []
        self.eps = eps
        self.clip = clip
        self.alpha = alpha
        self.warmup = warmup

    def update(self, reward):
        """Append ``reward`` to the history.

        Non-finite values (``inf``, ``-inf``, ``NaN``) are replaced by ``-1``
        before being stored so they cannot poison the running statistics.
        """
        if not np.isfinite(reward):
            reward = -1
        self.rewards.append(reward)

    def get_params(self):
        """Return the current ``(mean, std)`` used for scaling.

        Returns ``(0, 1)`` while the history is empty or holds at most
        ``warmup`` rewards. Otherwise returns the exponentially weighted mean
        and standard deviation of the recorded rewards.
        """
        count = len(self.rewards)
        # An empty history has no statistics (and would make the weighted
        # average fail), so treat it like the warmup phase.
        if count == 0 or count <= self.warmup:
            return 0, 1

        weights = edw(count, alpha=self.alpha)
        history = np.asarray(self.rewards, dtype=float)
        mu = np.average(history, weights=weights)
        # Plain weighted (population) variance: sum(w * d**2) / sum(w).
        # No bias correction is applied.
        variance = np.average((history - mu) ** 2, weights=weights)
        return mu, np.sqrt(variance)

    def scale(self, reward):
        """Normalize ``reward`` with the current statistics and clip it.

        Computes ``(reward - mean) / (std + eps)`` and clips the result to
        ``[-clip, clip]``.
        """
        mu, std = self.get_params()
        return np.clip((reward - mu) / (std + self.eps), -self.clip, self.clip)