联邦学习示例项目：更改结构

2025-04-20 15:19:24 +08:00
parent 1930e1b96b
commit 34a5247dd2
5 changed files with 42 additions and 10 deletions
--- a/fed_example/utils/train_utils.py
+++ b/fed_example/utils/train_utils.py
@@ -0,0 +1,370 @@
+import numpy as np
+import torch
+from tqdm import tqdm
+from sklearn.metrics import roc_auc_score, accuracy_score
+import copy
+import torch.nn.functional as F
+import random
+
+
+def train_deepmodel(device, model, loader, optimizer, criterion, epoch, model_name):
+    model.train()
+    running_loss = 0.0
+    corrects = 0.0
+    alpha = 1
+    beta = 0.1
+    for inputs, labels in tqdm(loader, desc=f'Training {model_name} Epoch {epoch + 1}', unit='batch'):
+        inputs, labels = inputs.float().to(device), labels.to(device)  # 确保数据在 device 上
+        optimizer.zero_grad()
+        
+        outputs, re_img = model(inputs)
+        loss = criterion(outputs.squeeze(), labels.float())
+        loss_F1 = F.l1_loss(re_img, inputs)
+        loss = alpha * loss + beta * loss_F1
+        loss.backward()
+        optimizer.step()
+        
+        running_loss += loss.item()
+    
+    avg_loss = running_loss / len(loader)
+    print(f'{model_name} Training Loss: {avg_loss:.4f}')
+    return avg_loss
+
+
+def validate_deepmodel(device, model, loader, criterion, epoch, model_name):
+    model.eval()
+    running_loss = 0.0
+    correct, total = 0, 0
+    all_labels, all_preds = [], []
+    val_corrects = 0.0
+    alpha = 1
+    beta = 0.1
+    
+    with torch.no_grad():
+        for inputs, labels in tqdm(loader, desc=f'Validating {model_name} Epoch {epoch + 1}', unit='batch'):
+            inputs, labels = inputs.float().to(device), labels.to(device)  # 确保数据在 device 上
+            
+            outputs, re_img = model(inputs)
+            
+            # 将 logits 转换为预测
+            predicted = torch.sigmoid(outputs).data
+            all_preds.extend(predicted.cpu().numpy())
+            all_labels.extend(labels.cpu().numpy())
+            
+            # loss = criterion(outputs.squeeze(), labels.float())
+            loss = criterion(outputs.squeeze(), labels.float())
+            loss_F1 = F.l1_loss(re_img, inputs)
+            loss = alpha * loss + beta * loss_F1
+            running_loss += loss.item()
+    
+    auc = roc_auc_score(all_labels, all_preds)
+    predicted_labels = (np.array(all_preds) >= 0.5).astype(int)  # 确保转换为 NumPy 数组
+    acc = accuracy_score(all_labels, predicted_labels)
+    avg_loss = running_loss / len(loader)
+    print(f'{model_name} Validation Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}, AUC: {auc:.4f}')
+    return avg_loss, acc, auc
+
+
+def test_deepmodel(device, model, loader):
+    model.eval()
+    all_labels, all_preds = [], []
+    
+    with torch.no_grad():
+        for inputs, labels in tqdm(loader, desc=f'Testing', unit='batch'):
+            inputs, labels = inputs.float().to(device), labels.to(device)  # 确保数据在 device 上
+            outputs, re_img = model(inputs)
+            predicted = torch.sigmoid(outputs).data  # 将 logits 转换为预测
+            
+            # 收集预测值和真实标签
+            all_preds.extend(predicted.cpu().numpy())
+            all_labels.extend(labels.cpu().numpy())
+    
+    # 将预测值转换为二值标签
+    predicted_labels = (np.array(all_preds) >= 0.5).astype(int)
+    
+    # 计算准确率和AUC
+    acc = accuracy_score(all_labels, predicted_labels)
+    auc = roc_auc_score(all_labels, all_preds)
+    
+    print(f'Test Accuracy: {acc:.4f}, Test AUC: {auc:.4f}')
+    return acc, auc
+
+
+# def train_model(device, model, loader, optimizer, criterion, epoch, model_name):
+#     model.train()
+#     running_loss = 0.0
+#     for i, (inputs, labels) in enumerate(tqdm(loader, desc=f'Training {model_name} Epoch {epoch + 1}', unit='batch')):
+#         inputs, labels = inputs.float().to(device), labels.float().to(device)  # 确保数据格式正确
+#         optimizer.zero_grad()
+#
+#         outputs = model(inputs)
+#         loss = criterion(outputs.squeeze(), labels)
+#
+#         # 随机打印部分输出和标签，检查格式
+#         if i % 10 == 0:  # 每100个批次打印一次
+#             print(f"Batch {i} - Sample Output: {outputs[0].item():.4f}, Sample Label: {labels[0].item()}")
+#
+#         # 检查损失值是否异常
+#         if loss.item() < 0:
+#             print(f"Warning: Negative loss detected at batch {i}. Loss: {loss.item()}")
+#
+#         loss.backward()
+#         optimizer.step()
+#
+#         running_loss += loss.item()
+#
+#     avg_loss = running_loss / len(loader)
+#     print(f'{model_name} Training Loss: {avg_loss:.4f}')
+#     return avg_loss
+
+def train_model(device, model, loader, optimizer, criterion, epoch, model_name):
+    model.train()
+    running_loss = 0.0
+    for inputs, labels in tqdm(loader, desc=f'Training {model_name} Epoch {epoch + 1}', unit='batch'):
+        inputs, labels = inputs.float().to(device), labels.to(device)  # 确保数据在 device 上
+        optimizer.zero_grad()
+        
+        outputs = model(inputs)
+        loss = criterion(outputs.squeeze(), labels.float())
+        loss.backward()
+        optimizer.step()
+        
+        running_loss += loss.item()
+    
+    avg_loss = running_loss / len(loader)
+    print(f'{model_name} Training Loss: {avg_loss:.4f}')
+    return avg_loss
+
+
+def validate_model(device, model, loader, criterion, epoch, model_name):
+    model.eval()
+    running_loss = 0.0
+    correct, total = 0, 0
+    all_labels, all_preds = [], []
+    
+    with torch.no_grad():
+        for inputs, labels in tqdm(loader, desc=f'Validating {model_name} Epoch {epoch + 1}', unit='batch'):
+            inputs, labels = inputs.float().to(device), labels.to(device)  # 确保数据在 device 上
+            
+            outputs = model(inputs)
+            
+            # 将 logits 转换为预测
+            predicted = torch.sigmoid(outputs).data
+            all_preds.extend(predicted.cpu().numpy())
+            all_labels.extend(labels.cpu().numpy())
+            
+            # loss = criterion(outputs.squeeze(), labels.float())
+            loss = criterion(outputs.squeeze(), labels.float())
+            running_loss += loss.item()
+    auc = roc_auc_score(all_labels, all_preds)
+    predicted_labels = (np.array(all_preds) >= 0.5).astype(int)  # 确保转换为 NumPy 数组
+    acc = accuracy_score(all_labels, predicted_labels)
+    avg_loss = running_loss / len(loader)
+    print(f'{model_name} Validation Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}, AUC: {auc:.4f}')
+    return avg_loss, acc, auc
+
+
+# 权重聚合函数
+def aggregate_weights(weights_list, alpha=1 / 3, beta=1 / 3, gamma=1 / 3):
+    new_state_dict = copy.deepcopy(weights_list[0])  # 从模型a复制权重结构
+    for key in new_state_dict.keys():
+        new_state_dict[key] = (alpha * weights_list[0][key] +
+                               beta * weights_list[1][key] +
+                               gamma * weights_list[2][key])
+    return new_state_dict
+
+
+def v3_update_model_weights(
+        epoch,
+        model_to_update,
+        other_models,
+        global_model,
+        losses,
+        val_loader,
+        device,
+        val_auc_threshold,  # 当前需要更新模型的验证 AUC 阈值
+        validate_model,
+        criterion,
+        update_frequency
+):
+    """
+    根据给定的条件更新模型的权重。
+
+    参数:
+        epoch (int): 当前训练轮次。
+        model_to_update: 需要更新的模型。
+        other_models (list): 其他模型列表，用于计算全局模型权重。
+        global_model: 全局模型。
+        losses (list): 各模型的损失值列表。
+        val_loader: 验证数据的 DataLoader。
+        device: 设备 ('cuda' 或 'cpu')。
+        val_auc_threshold (float): 当前需要更新模型的验证 AUC。
+        aggregate_weights (function): 权重聚合函数。
+        validate_model (function): 验证模型的函数。
+        update_frequency (int): 权重更新的频率。
+
+    返回:
+        val_acc (float): 全局模型的验证精度。
+        val_auc (float): 全局模型的验证 AUC。
+        updated_val_auc_threshold (float): 更新后的验证 AUC。
+    """
+    if (epoch + 1) % update_frequency == 0:
+        # 获取所有模型的权重
+        all_weights = [model.state_dict() for model in other_models]
+        avg_weights = aggregate_weights(all_weights)  # 聚合权重
+        
+        # 更新全局模型权重
+        global_model.load_state_dict(avg_weights)
+        
+        # 计算加权平均损失
+        weighted_loss = sum(loss * 0.33 for loss in losses)
+        print(f"Weighted Average Loss: {weighted_loss:.4f}")
+        
+        # 验证全局模型
+        val_loss, val_acc, val_auc = validate_model(device, global_model, val_loader, criterion, epoch, 'global_model')
+        print(f'global_model Validation Accuracy: {val_acc:.4f}, global_model Validation AUC: {val_auc:.4f}')
+        
+        # 如果全局模型的 AUC 更高，则更新目标模型
+        if val_auc > val_auc_threshold:
+            print(f'Updating model at epoch {epoch + 1}')
+            model_to_update.load_state_dict(global_model.state_dict())
+            val_auc_threshold = val_auc  # 更新 AUC 阈值
+        
+        return val_acc, val_auc, val_auc_threshold
+    return None, None, val_auc_threshold
+
+
+def update_model_weights(
+        epoch,
+        model_to_update,
+        other_models,
+        global_model,
+        losses,
+        val_loader,
+        device,
+        val_auc_threshold,  # 当前需要更新模型的验证 AUC 阈值
+        validate_model,
+        criterion,
+        update_frequency
+):
+    """
+    根据给定的条件更新模型的权重。
+
+    参数:
+        epoch (int): 当前训练轮次。
+        model_to_update: 需要更新的模型。
+        other_models (list): 其他模型列表，用于计算全局模型权重。
+        global_model: 全局模型。
+        losses (list): 各模型的损失值列表。
+        val_loader: 验证数据的 DataLoader。
+        device: 设备 ('cuda' 或 'cpu')。
+        val_auc_threshold (float): 当前需要更新模型的验证 AUC。
+        aggregate_weights (function): 权重聚合函数。
+        validate_model (function): 验证模型的函数。
+        update_frequency (int): 权重更新的频率。
+
+    返回:
+        val_acc (float): 全局模型的验证精度。
+        val_auc (float): 全局模型的验证 AUC。
+        updated_val_auc_threshold (float): 更新后的验证 AUC。
+    """
+    if (epoch + 1) % update_frequency == 0:
+        # 获取所有模型的权重
+        all_weights = [model.state_dict() for model in other_models]
+        avg_weights = aggregate_weights(all_weights)  # 聚合权重
+        
+        # 更新全局模型权重
+        global_model.load_state_dict(avg_weights)
+        
+        # 计算加权平均损失
+        weighted_loss = sum(loss * 0.33 for loss in losses)
+        print(f"Weighted Average Loss: {weighted_loss:.4f}")
+        
+        # 验证全局模型
+        val_loss, val_acc, val_auc = validate_deepmodel(device, global_model, val_loader, criterion, epoch,
+                                                        'global_model')
+        print(f'global_model Validation Accuracy: {val_acc:.4f}, global_model Validation AUC: {val_auc:.4f}')
+        
+        # 如果全局模型的 AUC 更高，则更新目标模型
+        if val_auc > val_auc_threshold:
+            print(f'Updating model at epoch {epoch + 1}')
+            model_to_update.load_state_dict(global_model.state_dict())
+            val_auc_threshold = val_auc  # 更新 AUC 阈值
+        
+        return val_acc, val_auc, val_auc_threshold
+    return None, None, val_auc_threshold
+
+
+def f_update_model_weights(
+        epoch,
+        model_to_update,
+        other_models,
+        global_model,
+        losses,
+        val_loader,
+        device,
+        val_auc_threshold,  # 当前需要更新模型的验证 AUC 阈值
+        aggregate_weights,  # 权重聚合函数
+        validate_model,
+        criterion,
+        update_frequency
+):
+    """
+    根据给定的条件更新模型的权重。
+
+    参数:
+        epoch (int): 当前训练轮次。
+        model_to_update: 需要更新的模型。
+        other_models (list): 其他模型列表，用于计算全局模型权重。
+        global_model: 全局模型。
+        losses (list): 各模型的损失值列表。
+        val_loader: 验证数据的 DataLoader。
+        device: 设备 ('cuda' 或 'cpu')。
+        val_auc_threshold (float): 当前需要更新模型的验证 AUC 阈值。
+        aggregate_weights (function): 权重聚合函数。
+        validate_model (function): 验证模型的函数。
+        criterion: 损失函数。
+        update_frequency (int): 权重更新的频率。
+
+    返回:
+        val_acc (float): 全局模型的验证精度。
+        val_auc (float): 全局模型的验证 AUC。
+        updated_val_auc_threshold (float): 更新后的验证 AUC 阈值。
+    """
+    # 每隔指定的 epoch 更新一次模型权重
+    
+    if (epoch + 1) % update_frequency == 0:
+        print(f"\n[Epoch {epoch + 1}] Updating global model weights...")
+        
+        # 获取其他模型的权重
+        all_weights = [model.state_dict() for model in other_models]
+        
+        # 使用聚合函数计算全局权重
+        avg_weights = aggregate_weights(all_weights)
+        print("Global model weights aggregated.")
+        
+        # 更新全局模型权重
+        global_model.load_state_dict(avg_weights)
+        
+        # 计算加权平均损失
+        weighted_loss = sum(loss * (1 / len(losses)) for loss in losses)  # 平均加权
+        print(f"Weighted Average Loss: {weighted_loss:.4f}")
+        
+        # 验证全局模型性能
+        val_loss, val_acc, val_auc = validate_deepmodel(device, global_model, val_loader, criterion, epoch,
+                                                        'global_model')
+        print(f"[Global Model] Validation Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, AUC: {val_auc:.4f}")
+        
+        # 如果全局模型 AUC 高于阈值，则更新目标模型权重
+        if val_auc > val_auc_threshold:
+            print(f"Global model AUC improved ({val_auc:.4f} > {val_auc_threshold:.4f}). Updating target model.")
+            model_to_update.load_state_dict(global_model.state_dict())
+            val_auc_threshold = val_auc  # 更新 AUC 阈值
+        else:
+            print(
+                f"Global model AUC did not improve ({val_auc:.4f} <= {val_auc_threshold:.4f}). No update to target model.")
+        
+        return val_acc, val_auc, val_auc_threshold
+    
+    # 如果未到达更新频率，返回当前的 AUC 阈值
+    return None, None, val_auc_threshold