TD3 agents

TD3-based agent

TD3Agent

 TD3Agent (environment_info:ddopai.utils.MDPInfo,
           learning_rate_actor:float=0.0003,
           learning_rate_critic:float|None=None,
           initial_replay_size:int=1024, max_replay_size:int=50000,
           batch_size:int=64, hidden_layers:List=None,
           activation:str='relu', tau:float=0.005, policy_delay:int=2,
           noise_std:float=0.2, sigma_scale:float=0.5, theta:float=0.15,
           dt=0.02, drop_prob:float=0.0, batch_norm:bool=False,
           init_method:str='xavier_uniform', optimizer:str='Adam',
           loss:str='MSE', obsprocessors:list|None=None, device:str='cpu',
           agent_name:str|None='SAC')

Twin Delayed Deep Deterministic Policy Gradient (TD3) agent. It wraps the MushroomRL TD3 implementation with an MLP actor, twin MLP critics, and an Ornstein-Uhlenbeck exploration policy, and adds convenience methods for training, evaluation, saving, and loading.

|  | Type | Default | Details |
|---|---|---|---|
| environment_info | MDPInfo |  |  |
| learning_rate_actor | float | 0.0003 |  |
| learning_rate_critic | float \| None | None | If None, it is set to learning_rate_actor |
| initial_replay_size | int | 1024 |  |
| max_replay_size | int | 50000 |  |
| batch_size | int | 64 |  |
| hidden_layers | List | None | If None, the default is [64, 64] |
| activation | str | relu | “relu”, “sigmoid”, “tanh”, “leakyrelu”, “elu” |
| tau | float | 0.005 |  |
| policy_delay | int | 2 |  |
| noise_std | float | 0.2 |  |
| sigma_scale | float | 0.5 |  |
| theta | float | 0.15 |  |
| dt | float | 0.02 |  |
| drop_prob | float | 0.0 |  |
| batch_norm | bool | False |  |
| init_method | str | xavier_uniform | “xavier_uniform”, “xavier_normal”, “he_normal”, “he_uniform”, “normal”, “uniform” |
| optimizer | str | Adam | “Adam”, “SGD”, or “RMSprop” |
| loss | str | MSE | Currently only MSE is supported |
| obsprocessors | list \| None | None | If None, defaults to [] |
| device | str | cpu | “cuda” or “cpu” |
| agent_name | str \| None | SAC |  |
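
To see how these arguments fit together, the following sketch (not part of the generated documentation) instantiates a TD3Agent with a few non-default hyperparameters. It assumes an environment object has already been constructed, as in the Newsvendor example further below; the chosen values are purely illustrative.

# Illustrative sketch: TD3Agent with non-default hyperparameters.
# Assumes `environment` is a ddopai environment (e.g., the NewsvendorEnv built below).
agent = TD3Agent(
    environment.mdp_info,
    learning_rate_actor = 1e-4,   # learning_rate_critic defaults to the same value when None
    hidden_layers = [128, 128],   # replaces the default [64, 64]
    activation = "tanh",          # one of "relu", "sigmoid", "tanh", "leakyrelu", "elu"
    drop_prob = 0.1,              # dropout probability for the MLP layers
    batch_size = 128,
    device = "cpu",               # "cuda" or "cpu"
)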
# #| export

# class TD3Agent():

#     train_mode = "env_interaction"

#     """
#     Twin Delayed Deep Deterministic Policy Gradient (TD3) agent, built on the MushroomRL TD3
#     implementation with a deterministic MLP actor, twin MLP critics, and Ornstein-Uhlenbeck
#     exploration noise.

#     Args:
#         environment_info (MDPInfo): Contains relevant information about the environment.
#         learning_rate_actor (float): Learning rate for the actor.
#         learning_rate_critic (float): Learning rate for the critic. If None, it is set to learning_rate_actor.
#         initial_replay_size (int): Number of transitions to collect in the replay buffer before training starts.
#         max_replay_size (int): Maximum number of transitions to keep in the replay buffer.
#         batch_size (int): Number of transitions to sample each time experience is replayed.
#         hidden_layers (List): Sizes of the hidden layers of the actor and critic networks.
#         tau (float): Parameter for the soft update of the target networks.
#         policy_delay (int): Number of critic updates between consecutive actor (policy) updates.
#         noise_std (float): Standard deviation of the noise added to the target policy during the critic update.
#         sigma_scale, theta, dt (float): Parameters of the Ornstein-Uhlenbeck exploration policy.
#         optimizer (torch.optim): Optimizer to use for the networks.
#         loss (str): Loss function for the critic; currently only "MSE" is supported.
#         squeeze_output (bool): Whether to squeeze the output of the critic network or not.
#         device (str): "cuda" or "cpu". If "cuda" is requested but not available, the CPU is used.
#         agent_name (str): Name of the agent. If set to None, the default name "TD3_classic" is used.

#     """

#     def __init__(
#             self,
#             environment_info: MDPInfo,
#             learning_rate_actor = 3e-4,
#             learning_rate_critic = None,
#             initial_replay_size = 1024,
#             max_replay_size = 50000,
#             batch_size = 64,
#             hidden_layers = [64, 64],
#             tau = 0.005,
#             policy_delay = 2,
#             noise_std = 0.2,
#             optimizer = optim.Adam,
#             sigma_scale = 0.5,

#             loss = "MSE",

#             theta=0.15,
#             dt=0.02,
#             squeeze_output = True,
#             device = "cuda",
#             agent_name = None): 
        
#         # print("in init fubction")

#         self.warmup_training_steps = initial_replay_size

#         mdp_info = environment_info
#         optimizer = optim.Adam
        
#         self.policy_class = OrnsteinUhlenbeckPolicy
#         self.policy_params = dict(sigma=np.ones(1) * sigma_scale, theta=theta, dt=dt)

#         if len(mdp_info.observation_space.shape) == 2:
#             input_shape = (mdp_info.observation_space.shape[0]*mdp_info.observation_space.shape[1],)
#         else:
#             input_shape = mdp_info.observation_space.shape

#         actor_output_shape = (mdp_info.action_space.shape[0],) 

#         print(input_shape)

#         use_cuda = False

#         if learning_rate_critic is None:
#             learning_rate_critic = learning_rate_actor

#         actor_params = dict(network=MLPActor,
#                                 hidden_layers=hidden_layers,
#                                 input_shape=input_shape,
#                                 output_shape=actor_output_shape,
#                                 use_cuda=use_cuda)
        
#         # print("setting optimizer class")
#         actor_optimizer = {'class': optimizer,
#                     'params': {'lr': learning_rate_actor}} 
        
#         critic_input_shape = (input_shape[0] + actor_output_shape[0],)
#         critic_params = dict(network=MLPStateAction,
#                         optimizer={'class': optimizer,
#                                 'params': {'lr': learning_rate_critic}}, 
#                         loss=F.mse_loss,
#                         hidden_layers=hidden_layers,
#                         input_shape=critic_input_shape,
#                         output_shape=(1,),
#                         squeeze_output=squeeze_output,
#                         use_cuda=use_cuda)
        
#         # print("creating agent from mushroom")
        
#         self.agent = TD3(mdp_info, self.policy_class, self.policy_params,
#                     actor_params, actor_optimizer, critic_params, batch_size,
#                     initial_replay_size, max_replay_size, tau, policy_delay, noise_std)
                
#         self.network_list, self.actor, self.critic = self.get_network_list(set_actor_critic_attributes=True)
    
#         # print("created agent from mushroom")

#         if agent_name is None:
#             self.agent.name = 'TD3_classic'
#         else:
#             self.agent.name = agent_name

#     def __getattr__(self, attr):
#         return getattr(self.agent, attr)

#     def train(self,):
#         self.agent.policy.train()
    
#     def eval(self,):
#         self.agent.policy.eval()

#     def get_network_list(self, set_actor_critic_attributes: bool = True):
#         """ Get the list of networks in the agent for the save and load functions
#         Get the actor for the predict function in eval mode """

#         networks = []
#         ensemble_critic = self.agent._critic_approximator._impl.model
#         for i, model in enumerate(ensemble_critic):
#             networks.append(model.network)
#         networks.append(self.agent.policy._approximator._impl.model.network)

#         actor = self.agent.policy._approximator._impl.model.network
#         critic = ensemble_critic[0].network

#         if set_actor_critic_attributes:
#             return networks, actor, critic
#         else:
#             return networks
        
#     def save(self,
#                 path: str, # The directory where the file will be saved.
#                 overwrite: bool=True): # Allow overwriting; if False, a FileExistsError will be raised if the file exists.
        
#         """
#         Save the PyTorch model to a file in the specified directory.

#         """
        
#         if not hasattr(self, 'network_list') or self.network_list is None:
#             raise AttributeError("Cannot find networks.")

#         # Create the directory path if it does not exist
#         os.makedirs(path, exist_ok=True)

#         # Construct the file path using os.path.join for better cross-platform compatibility

#         for network_number, network in enumerate(self.network_list):
#             full_path = os.path.join(path, f"network_{network_number}.pth")

#             if os.path.exists(full_path):
#                 if not overwrite:
#                     raise FileExistsError(f"The file {full_path} already exists and will not be overwritten.")
#                 else:
#                     logging.debug(f"Overwriting file {full_path}") # Only log with info as during training we will continuously overwrite the model
            
#             # Save the model's state_dict using torch.save
#             torch.save(network.state_dict(), full_path)
#         logging.debug(f"Model saved successfully to {full_path}")
    
#     def load(self, path: str):
#         """
#         Load the PyTorch models from files in the specified directory.
#         """
        
#         if not hasattr(self, 'network_list') or self.network_list is None:
#             raise AttributeError("Cannot find networks to load.")

#         # Check for the presence of model files
#         for network_number, network in enumerate(self.network_list):
#             full_path = os.path.join(path, f"network_{network_number}.pth")

#             if not os.path.exists(full_path):
#                 raise FileNotFoundError(f"The file {full_path} does not exist.")
            
#             try:
#                 # Load each network's state_dict
#                 network.load_state_dict(torch.load(full_path))
#                 logging.info(f"Network {network_number} loaded successfully from {full_path}")
#             except Exception as e:
#                 raise RuntimeError(f"An error occurred while loading network {network_number}: {e}")
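
The save and load methods in the reference implementation above write and read one state_dict file per network (the twin critics and the actor, named network_0.pth, network_1.pth, ...). Below is a minimal sketch of a save/load round trip; the directory name is illustrative, not prescribed by the library.

# Sketch: persisting and restoring the agent's networks (the directory name is illustrative).
agent.save("saved_models/td3_agent", overwrite=True)   # writes network_0.pth, network_1.pth, ...

# A freshly constructed agent with the same architecture can then restore the weights:
agent.load("saved_models/td3_agent")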
import numpy as np

from ddopai.envs.inventory.single_period import NewsvendorEnv
from ddopai.dataloaders.tabular import XYDataLoader
from ddopai.experiments.experiment_functions import run_experiment, test_agent
# ClipAction is assumed to have been imported from ddopai earlier in the notebook;
# TD3Agent is the class defined above.
val_index_start = 8000 #90_000
test_index_start = 9000 #100_000

X = np.random.standard_normal((10000, 2))
Y = np.random.standard_normal((10000, 1))
Y += 2*X[:,0].reshape(-1, 1) + 3*X[:,1].reshape(-1, 1)
Y = X[:,0].reshape(-1, 1) # overwrite: demand is a deterministic function of the first feature only
# truncate Y at 0:
Y = np.maximum(Y, 0)
# normalize Y so that its maximum is 1:
Y = Y/np.max(Y)

print(np.max(Y))

print(X.shape, Y.shape)

clip_action = ClipAction(0., 1.)

dataloader = XYDataLoader(X, Y, val_index_start, test_index_start, lag_window_params =  {'lag_window': 0, 'include_y': False, 'pre_calc': True})

environment = NewsvendorEnv(
    dataloader = dataloader,
    underage_cost = 0.42857,
    overage_cost = 1.0,
    gamma = 0.999,
    horizon_train = 365,
    q_bound_high = 1.0,
    q_bound_low = -0.1,
    postprocessors = [clip_action],
)



agent = TD3Agent(environment.mdp_info,
                obsprocessors = None,      # default: []
                device="cpu", # "cuda" or "cpu"
)

environment.test()
agent.eval()

R, J = test_agent(agent, environment)

print(R, J)

environment.train()
agent.train()
environment.print=False

# run_experiment(agent, environment, n_epochs=50, n_steps=1000, run_id = "test", save_best=True, print_freq=1) # fit agent via run_experiment function

environment.test()
agent.eval()

R, J = test_agent(agent, environment)

print(R, J)
1.0
(10000, 2) (10000, 1)
INFO:root:Actor network:
Checking tuple: (2,)
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
MLPActor                                 [1, 1]                    --
├─Sequential: 1-1                        [1, 1]                    --
│    └─Linear: 2-1                       [1, 64]                   192
│    └─ReLU: 2-2                         [1, 64]                   --
│    └─Dropout: 2-3                      [1, 64]                   --
│    └─Linear: 2-4                       [1, 64]                   4,160
│    └─ReLU: 2-5                         [1, 64]                   --
│    └─Dropout: 2-6                      [1, 64]                   --
│    └─Linear: 2-7                       [1, 1]                    65
│    └─Identity: 2-8                     [1, 1]                    --
==========================================================================================
Total params: 4,417
Trainable params: 4,417
Non-trainable params: 0
Total mult-adds (M): 0.00
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
==========================================================================================
INFO:root:Critic network:
Checking tuple: (2,)
Checking tuple: (1,)
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
MLPStateAction                           --                        --
├─Sequential: 1-1                        [1, 1]                    --
│    └─Linear: 2-1                       [1, 64]                   256
│    └─ReLU: 2-2                         [1, 64]                   --
│    └─Dropout: 2-3                      [1, 64]                   --
│    └─Linear: 2-4                       [1, 64]                   4,160
│    └─ReLU: 2-5                         [1, 64]                   --
│    └─Dropout: 2-6                      [1, 64]                   --
│    └─Linear: 2-7                       [1, 1]                    65
│    └─Identity: 2-8                     [1, 1]                    --
==========================================================================================
Total params: 4,481
Trainable params: 4,481
Non-trainable params: 0
Total mult-adds (M): 0.00
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
==========================================================================================
-779.2586167634846 -492.39378518242427
-779.2586167634846 -492.39378518242427