In [1]:
using Plots; plotlyjs();
using Mocha;

Configuring Mocha...
 * CUDA       disabled by default
 * Native Ext disabled by default
Mocha configured, continue loading module...




DefaultBackend = Mocha.CPUBackend


In [2]:
# coffee break to save results
type Recorder <: Coffee
    loss::Vector{Float64};
    movingLoss::Vector{Float64};
end

In [3]:
# Initialize Solvers with max iterations maxi
maxi = 1000; 

adam=Adam();
paramsAdam = make_solver_parameters(adam, stepsize=0.001,
        beta1=0.999,
        beta2=0.999,
        epsilon=1e-3,
        max_iter=maxi);

adagrad = Adagrad();
paramsAdagrad = make_solver_parameters(adagrad,
        max_iter=maxi,
        gamma=0.3, 
        epsilon=1e-5);

adadelta = Adadelta();
paramsAdadelta = make_solver_parameters(adadelta,
        max_iter=maxi,
        rho=0.98,eps=1e-6);

sgd=SGD();
paramsSGD = make_solver_parameters(sgd,
        max_iter=maxi,
        lr_policy=LRPolicy.Fixed(0.01));

paramsMom = make_solver_parameters(sgd,
        max_iter=maxi,
        mom_policy=MomPolicy.Fixed(0.2),
        lr_policy=LRPolicy.Fixed(0.02));

INSIDE make_solver_parameters of SGD
INSIDE make_solver_parameters of SGD


In [4]:
function runMNIST(optimizer::Mocha.SolverMethod , params::Dict , r::Recorder)
    
    srand(12345678)

    data_layer  = AsyncHDF5DataLayer(name="train-data", source="data/train.txt", batch_size=200, shuffle=true)
    conv_layer  = ConvolutionLayer(name="conv1", n_filter=20, kernel=(5,5), bottoms=[:data], tops=[:conv])
    pool_layer  = PoolingLayer(name="pool1", kernel=(2,2), stride=(2,2), bottoms=[:conv], tops=[:pool])
    conv2_layer = ConvolutionLayer(name="conv2", n_filter=50, kernel=(5,5), bottoms=[:pool], tops=[:conv2])
    pool2_layer = PoolingLayer(name="pool2", kernel=(2,2), stride=(2,2), bottoms=[:conv2], tops=[:pool2])
    fc1_layer   = InnerProductLayer(name="ip1", output_dim=500, neuron=Neurons.ReLU(), bottoms=[:pool2], tops=[:ip1])
    fc2_layer   = InnerProductLayer(name="ip2", output_dim=10, bottoms=[:ip1], tops=[:ip2])
    loss_layer  = SoftmaxLossLayer(name="loss", bottoms=[:ip2,:label])

    backend = DefaultBackend()
    init(backend)

    common_layers = [conv_layer, pool_layer, conv2_layer, pool2_layer, fc1_layer, fc2_layer]
    net = Net("MNIST-train", backend, [data_layer, common_layers..., loss_layer])

    exp_dir = "snapshots-$(Mocha.default_backend_type)"

    data_layer_test = HDF5DataLayer(name="test-data", source="data/test.txt", batch_size=200)
    acc_layer = AccuracyLayer(name="test-accuracy", bottoms=[:ip2, :label])
    test_net = Net("MNIST-test", backend, [data_layer_test, common_layers..., acc_layer])
    validate_performance = ValidationPerformance(test_net)
    lr_policy = LRPolicy.DecayOnValidation(1.e-2, "test-accuracy-accuracy", 0.5)
 

    solver = Solver(optimizer, params)
    add_coffee_break(solver, TrainingSummary(), every_n_iter=25)


    function Mocha.enjoy(lounge::CoffeeLounge, coffee::Recorder, net::Net, state::SolverState)
        push!(coffee.loss, state.obj_val);
        A=0.6;
        push!(coffee.movingLoss, A*coffee.loss[max(1,length(coffee.loss)-1)] + (1-A)*state.obj_val);
        #println(state.obj_val)
    end
    r = Recorder([],[]);
    add_coffee_break(solver, r, every_n_iter=1)

    solve(solver, net)

    destroy(net);
    
    return r
end

runMNIST (generic function with 1 method)

In [5]:
r_sgd = runMNIST(sgd, paramsSGD, Recorder([],[]));
r_mom = runMNIST(sgd, paramsMom, Recorder([],[]));
r_adagrad = runMNIST(adagrad, paramsAdagrad, Recorder([],[]));
r_adadelta = runMNIST(adadelta, paramsAdadelta, Recorder([],[]));
r_adam = runMNIST(adam, paramsAdam, Recorder([],[]));

21-Oct 01:37:37:INFO:root:Constructing net MNIST-train on Mocha.CPUBackend...
21-Oct 01:37:37:INFO:root:Topological sorting 8 layers...
21-Oct 01:37:37:INFO:root:Setup layers...
21-Oct 01:37:39:INFO:root:Network constructed!
21-Oct 01:37:40:INFO:root:Constructing net MNIST-test on Mocha.CPUBackend...
21-Oct 01:37:40:INFO:root:Topological sorting 8 layers...
21-Oct 01:37:40:INFO:root:Setup layers...
21-Oct 01:37:40:DEBUG:root:ConvolutionLayer(conv1): sharing filters and bias
21-Oct 01:37:40:DEBUG:root:ConvolutionLayer(conv2): sharing filters and bias
21-Oct 01:37:40:DEBUG:root:InnerProductLayer(ip1): sharing weights and bias
21-Oct 01:37:40:DEBUG:root:InnerProductLayer(ip2): sharing weights and bias
21-Oct 01:37:40:INFO:root:Network constructed!
21-Oct 01:37:40:DEBUG:root:#DEBUG Checking network topology for back-propagation
21-Oct 01:37:41:DEBUG:root:Init network MNIST-train
21-Oct 01:37:41:DEBUG:root:Init parameter filter for layer conv1
21-Oct 01:37:41:DEBUG:root:Init parameter bias 

In [None]:
length(r_sgd.loss)

In [6]:
plot(r_sgd.loss, xaxis=("Iteration",:log), yaxis = ("Loss",:log),lab="SGD")
plot!(r_mom.loss,lab="Momentum")
plot!(r_adam.loss,lab="Adam (Google)")
plot!(r_adagrad.loss,lab="Adagrad (Amini)")
plot!(r_adadelta.loss,lab="Adadelta (Amini)")

[Plots.jl] Initializing backend: plotlyjs


In [None]:
plot(r_sgd.movingLoss, xaxis=("Iteration",:log), yaxis = ("Loss",:log),lab="SGD")
plot!(r_mom.movingLoss,lab="Momentum")
plot!(r_adam.movingLoss,lab="Adam (Google)")
plot!(r_adagrad.movingLoss,lab="Adagrad (Amini)")
plot!(r_adadelta.movingLoss,lab="Adadelta (Amini)")

In [11]:
records = [r_sgd, r_mom, r_adam, r_adagrad, r_adadelta];
A=zeros(maxi+1, length(records));

alpha=0.8;
for i=1:maxi+1
    for j=1:length(records)
        if i==1
            A[i,j] = records[j].loss[i];
        else
            A[i,j] = alpha*A[i-1,j] + (1-alpha)*records[j].loss[i]
        end
    end
end

In [12]:
plot(A, xaxis=("Iteration",:log), yaxis = ("Loss",:log),)

In [None]:
length(r_sgd.loss)

In [13]:
print(A)

[2.36983060836792 2.36983060836792 2.36983060836792 2.36983060836792 2.36983060836792
 2.362575149536133 2.3560163974761963 2.336858129501343 2.3388720512390138 2.321275520324707
 2.353039531707764 2.3365618228912353 2.2873410987854004 2.2668083715438843 2.2371545219421387
 2.3481137981414797 2.3179665756225587 2.220213636398315 2.158753952026367 2.244267780303955
 2.3450477783203127 2.296569534301758 2.13057869644165 2.019298045349121 2.276529470062256
 2.328243689880371 2.2597634857177735 2.0090807674026485 1.8452165591812133 2.098348839263916
 2.316288580215454 2.231051545562744 1.8992383716659544 1.7095250024871826 1.8816165790405273
 2.301699344091797 2.2024566977996827 1.7758378456558226 1.6049292909683228 1.6666460407135009
 2.284546425957031 2.1637122511688234 1.6329382714053344 1.4855644406939699 1.4871973572061157
 2.2653994965700686 2.1226865434506226 1.4930934950844725 1.3565668584557498 1.3044173978078917
 2.2532459106563234 2.087431175220703 1.361769799004895 1.2280254222