
Some layers of my network load their weights from a pretrained model. I want to fix those parameters and train only the other layers. How can I prevent the weights from being updated in Caffe?

I followed this page and set lr_mult and decay_mult to 0, set propagate_down: false, and even set base_lr: 0 and weight_decay: 0 in the solver. However, the test loss (each test runs over all the test images) still changes slowly from test to test, and after a few thousand iterations the accuracy drops to 0 (from 80% when the pretrained model is first loaded).
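
Concretely, the pattern I applied to every layer boils down to this (a minimal sketch; the layer name and blob names here are placeholders):

layer {
  name: "frozen_conv"      # placeholder name
  type: "Convolution"
  bottom: "in"
  top: "out"
  propagate_down: false    # block gradient flow to lower layers
  param {
    lr_mult: 0             # zero learning rate for this blob
    decay_mult: 0          # zero weight decay for this blob
  }
  convolution_param {
    num_output: 16
    kernel_size: 3
  }
}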

Here is a two-layer example where I just initialize the weights and set all the parameters above to 0. I want to freeze every layer in this example, but once training starts the loss still keeps changing...

layer {
  name: "data"
  type: "ImageData"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  transform_param {
    scale: 0.017
    mirror: true
    crop_size: 32
    mean_value: 115
    mean_value: 126
    mean_value: 130
    color: true
    contrast: true
    brightness: true
  }
  image_data_param {
    source: "/data/zhuhao5/data/cifar100/cifar100_train_replicate.txt"
    batch_size: 64
    shuffle: true
    #pair_size: 3
  }
}
layer {
  name: "data"
  type: "ImageData"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    scale: 0.017
    mirror: false
    crop_size: 32
    mean_value: 115
    mean_value: 126
    mean_value: 130
  }
  image_data_param {
    source: "/data/zhuhao5/data/cifar100/cifar100_test.txt"
    batch_size: 100
    shuffle: false
  }
}
#-------------- TEACHER --------------------
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  propagate_down: false
  top: "conv1"
  param {
    lr_mult: 0
    decay_mult: 0
  }
  convolution_param {
    num_output: 16
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "msra"
    }
  }
}
layer {
  name: "res2_1a_1_bn"
  type: "BatchNorm"
  bottom: "conv1"
  propagate_down: false
  top: "res2_1a_1_bn"
  param {
    lr_mult: 0
    decay_mult: 0
  }
  param {
    lr_mult: 0
    decay_mult: 0
  }
}
layer {
  name: "res2_1a_1_scale"
  type: "Scale"
  bottom: "res2_1a_1_bn"
  propagate_down: false
  top: "res2_1a_1_bn"
  param {
    lr_mult: 0
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
layer {
  name: "res2_1a_1_relu"
  type: "ReLU"
  bottom: "res2_1a_1_bn"
  propagate_down: false
  top: "res2_1a_1_bn"
}
layer {
  name: "pool_5"
  type: "Pooling"
  bottom: "res2_1a_1_bn"
  propagate_down: false
  top: "pool_5"
  pooling_param {
    pool: AVE
    global_pooling: true
  }
}
layer {
  name: "fc100"
  type: "InnerProduct"
  bottom: "pool_5"
  propagate_down: false
  top: "fc100"
  param {
    lr_mult: 0
    decay_mult: 0
  }
  param {
    lr_mult: 0
    decay_mult: 0
  }
  inner_product_param {
    num_output: 100
    weight_filler {
      type: "msra"
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
#---------------------------------
layer {
  name: "tea_soft_loss"
  type: "SoftmaxWithLoss"
  bottom: "fc100"
  bottom: "label"
  propagate_down: false
  propagate_down: false
  top: "tea_soft_loss"
  loss_weight: 0
}

##----------- ACCURACY----------------

layer {
  name: "teacher_accuracy"
  type: "Accuracy"
  bottom: "fc100"
  bottom: "label"
  top: "teacher_accuracy"
  accuracy_param {
    top_k: 1
  }
}

Here is the solver:

test_iter: 100
test_interval: 10

base_lr: 0
momentum: 0
weight_decay: 0

lr_policy: "poly"
power: 1

display: 10000
max_iter: 80000
snapshot: 5000

type: "SGD"
solver_mode: GPU
random_seed: 10086

And the log:

I0829 16:31:39.363433 14986 net.cpp:200] teacher_accuracy does not need backward computation. 
I0829 16:31:39.363438 14986 net.cpp:200] tea_soft_loss does not need backward computation. 
I0829 16:31:39.363442 14986 net.cpp:200] fc100_fc100_0_split does not need backward computation. 
I0829 16:31:39.363446 14986 net.cpp:200] fc100 does not need backward computation. 
I0829 16:31:39.363451 14986 net.cpp:200] pool_5 does not need backward computation. 
I0829 16:31:39.363454 14986 net.cpp:200] res2_1a_1_relu does not need backward computation. 
I0829 16:31:39.363458 14986 net.cpp:200] res2_1a_1_scale does not need backward computation. 
I0829 16:31:39.363462 14986 net.cpp:200] res2_1a_1_bn does not need backward computation. 
I0829 16:31:39.363466 14986 net.cpp:200] conv1 does not need backward computation. 
I0829 16:31:39.363471 14986 net.cpp:200] label_data_1_split does not need backward computation. 
I0829 16:31:39.363485 14986 net.cpp:200] data does not need backward computation. 
I0829 16:31:39.363490 14986 net.cpp:242] This network produces output tea_soft_loss 
I0829 16:31:39.363494 14986 net.cpp:242] This network produces output teacher_accuracy 
I0829 16:31:39.363507 14986 net.cpp:255] Network initialization done. 
I0829 16:31:39.363559 14986 solver.cpp:56] Solver scaffolding done. 
I0829 16:31:39.363852 14986 caffe.cpp:248] Starting Optimization 
I0829 16:31:39.363862 14986 solver.cpp:272] Solving WRN_22_12_to_WRN_18_4_v5_net 
I0829 16:31:39.363865 14986 solver.cpp:273] Learning Rate Policy: poly 
I0829 16:31:39.365981 14986 solver.cpp:330] Iteration 0, Testing net (#0) 
I0829 16:31:39.366190 14986 blocking_queue.cpp:49] Waiting for data 
I0829 16:31:39.742347 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 85.9064 
I0829 16:31:39.742437 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0113 
I0829 16:31:39.749806 14986 solver.cpp:218] Iteration 0 (0 iter/s, 0.385886s/10000 iters), loss = 0 
I0829 16:31:39.749862 14986 solver.cpp:237]  Train net output #0: tea_soft_loss = 4.97483 
I0829 16:31:39.749877 14986 solver.cpp:237]  Train net output #1: teacher_accuracy = 0 
I0829 16:31:39.749908 14986 sgd_solver.cpp:105] Iteration 0, lr = 0 
I0829 16:31:39.794306 14986 solver.cpp:330] Iteration 10, Testing net (#0) 
I0829 16:31:40.171447 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.9119 
I0829 16:31:40.171510 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0115 
I0829 16:31:40.219133 14986 solver.cpp:330] Iteration 20, Testing net (#0) 
I0829 16:31:40.596911 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.91862 
I0829 16:31:40.596971 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0116 
I0829 16:31:40.645246 14986 solver.cpp:330] Iteration 30, Testing net (#0) 
I0829 16:31:41.021711 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.92105 
I0829 16:31:41.021772 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0117 
I0829 16:31:41.069464 14986 solver.cpp:330] Iteration 40, Testing net (#0) 
I0829 16:31:41.447345 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.91916 
I0829 16:31:41.447407 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0117 
I0829 16:31:41.495157 14986 solver.cpp:330] Iteration 50, Testing net (#0) 
I0829 16:31:41.905607 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.9208 
I0829 16:31:41.905654 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0117 
I0829 16:31:41.952659 14986 solver.cpp:330] Iteration 60, Testing net (#0) 
I0829 16:31:42.327942 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.91936 
I0829 16:31:42.328025 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0117 
I0829 16:31:42.374279 14986 solver.cpp:330] Iteration 70, Testing net (#0) 
I0829 16:31:42.761359 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.91859 
I0829 16:31:42.761430 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0117 
I0829 16:31:42.807821 14986 solver.cpp:330] Iteration 80, Testing net (#0) 
I0829 16:31:43.232321 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.91668 
I0829 16:31:43.232398 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0117 
I0829 16:31:43.266436 14986 solver.cpp:330] Iteration 90, Testing net (#0) 
I0829 16:31:43.514633 14986 blocking_queue.cpp:49] Waiting for data 
I0829 16:31:43.638617 14986 solver.cpp:397]  Test net output #0: tea_soft_loss = 4.91836 
I0829 16:31:43.638684 14986 solver.cpp:397]  Test net output #1: teacher_accuracy = 0.0117 
I0829 16:31:43.685451 14986 solver.cpp:330] Iteration 100, Testing net (#0) 

I don't know what I have missed in Caffe's update process :(

Answer:

I found the reason.

The BatchNorm layer behaves differently in the TRAIN and TEST phases: use_global_stats defaults to false during training, so the layer keeps updating its running mean and variance in the forward pass. Those statistics are stored as blobs but are not touched by the solver, which is why zeroing lr_mult, decay_mult, and base_lr does not freeze them.

In my case, I should set use_global_stats: true during training as well.

And don't forget the Scale layer.

The corrected layers should be:

layer {
  name: "res2_1a_1_bn"
  type: "BatchNorm"
  bottom: "conv1"
  top: "res2_1a_1_bn"
  batch_norm_param {
    use_global_stats: true
  }
}
layer {
  name: "res2_1a_1_scale"
  type: "Scale"
  bottom: "res2_1a_1_bn"
  top: "res2_1a_1_bn"
  param {
    lr_mult: 0
    decay_mult: 0
  }
  param {
    lr_mult: 0
    decay_mult: 0
  }
  scale_param {
    bias_term: true
  }
}
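
Putting it together, a fully frozen Conv -> BatchNorm -> Scale stack would then look something like the sketch below. This just combines the settings already shown above: use_global_stats: true keeps the running statistics fixed during training, propagate_down: false blocks gradient flow, and the zeroed multipliers keep the solver away from the learnable blobs.

layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  propagate_down: false                 # no gradients flow back through this layer
  param { lr_mult: 0 decay_mult: 0 }    # frozen convolution weights
  convolution_param {
    num_output: 16
    bias_term: false
    pad: 1
    kernel_size: 3
    stride: 1
  }
}
layer {
  name: "res2_1a_1_bn"
  type: "BatchNorm"
  bottom: "conv1"
  top: "res2_1a_1_bn"
  batch_norm_param {
    use_global_stats: true              # use the stored mean/variance in TRAIN too
  }
}
layer {
  name: "res2_1a_1_scale"
  type: "Scale"
  bottom: "res2_1a_1_bn"
  top: "res2_1a_1_bn"
  param { lr_mult: 0 decay_mult: 0 }    # frozen scale
  param { lr_mult: 0 decay_mult: 0 }    # frozen bias
  scale_param { bias_term: true }
}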