Source code for gsnn.optim.RewardScaler

import numpy as np 

[docs]def edw(N, alpha):
    # Generate exponential decay weights
    weights = np.array([(1-alpha) ** i for i in range(N)])
    
    # Normalize weights to sum to 1
    weights /= np.sum(weights)
    
    return weights[::-1]

[docs]class RewardScaler:
    def __init__(self, clip=5, eps=1e-3, alpha=0.04, warmup=3):
        """
        Reward scaling with exponential moving average statistics.

        Scales rewards using a running mean and standard deviation computed with exponential decay weights.
        This helps stabilize training by normalizing the reward distribution.

        Args:
            clip (float, optional): Maximum absolute value for scaled rewards. Default: 5
            eps (float, optional): Small constant for numerical stability. Default: 1e-3
            alpha (float, optional): Decay rate for exponential moving average. Default: 0.04
            warmup (int, optional): Number of updates before using computed statistics. Default: 3

        Example:
            >>> scaler = RewardScaler(clip=5, alpha=0.1)
            >>> for i in range(100):
            ...     reward = get_reward()
            ...     scaler.update(reward)
            ...     scaled_reward = scaler.scale(reward)
        """

        self.rewards = []
        self.eps = eps
        self.clip = clip
        self.alpha = alpha
        self.warmup = warmup

[docs]    def update(self, reward):
        
        if np.isinf(reward) or np.isnan(reward): reward = -1
        self.rewards.append(reward)

[docs]    def get_params(self):
        
        if len(self.rewards) <= self.warmup:
            return 0, 1
        else: 
            weights = edw(len(self.rewards), alpha=self.alpha)
            mu = np.average(self.rewards, weights=weights)

            # Bias-corrected weighted variance
            squared_diffs = (self.rewards - mu) ** 2
            
            std = np.sqrt(np.sum(weights * squared_diffs) / np.sum(weights))

            return mu, std
       
[docs]    def scale(self, reward):

        mu, std = self.get_params()
        
        return np.clip((reward - mu) / (std + self.eps), -self.clip, self.clip)