Sandya Subramanian
HST (Health Sciences and Technology), 2nd year Ph.D.
This was mostly just for fun: it's the analog of the assignment where we generated fake Shakespeare text, and I wanted to see if I could generate "fake" ECG data.
using CSV, Plots
using Knet, AutoGrad
using Knet: sigm_dot, tanh_dot
plotlyjs()
Knet.gpu(0)
fs_motion = 500;
sample_motion = CSV.read("test22_00j.csv", header = ["Time", "ECG1", "ECG2", "ECG3", "ECG4"], datarow=1, nullable=false)
fs_arrhyth = 360;
sample_arrhyth = CSV.read("207.csv",header = ["Time", "ECG1", "ECG2"], datarow=1, nullable=false)
plot(sample_motion[:,1],sample_motion[:,4],title = "Sample ECG during Jumping with Motion Artifact",xlabel="Time (sec)")
plot(sample_arrhyth[360*515:360*535,1],sample_arrhyth[360*515:360*535,3],title = "Sample ECG during Arrhythmia",xlabel="Time (sec)")
The setup is essentially the same as the assignment, with a few changes to account for the different data format: each unique amplitude value in the signal plays the role of a character.
#Task 1: Create dictionary
function createVocabulary(signal)
vocab = Dict{Float64,Int64}()
# List of unique values in signal
chars = unique(signal)
for i = 1:length(chars)
vocab[chars[i]] = i
end
return vocab
end
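As a quick sanity check on the analogy (the toy signal below is made up, not from either dataset), the vocabulary just maps each distinct amplitude value to an integer index:
toy_signal = [0.0, 0.5, 0.5, -0.25, 0.0]            # hypothetical amplitude values
toy_vocab = createVocabulary(toy_signal)            # e.g. Dict(0.0=>1, 0.5=>2, -0.25=>3)
println([toy_vocab[x] for x in toy_signal])         # the signal re-expressed as indices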
#LSTM Network Function
function lstm(weight,bias,hidden,cell,input)
gates = hcat(input,hidden) * weight .+ bias
hsize = size(hidden,2) #size of layer
forget = sigm_dot(gates[:,1:hsize]) #weight is (I+H) x 4H; gate columns ordered forget, input, output, change
ingate = sigm_dot(gates[:,1+hsize:2hsize])
outgate = sigm_dot(gates[:,1+2hsize:3hsize])
change = tanh_dot(gates[:,1+3hsize:end])
cell = cell .* forget + ingate .* change
hidden = outgate .* tanh_dot(cell)
return (hidden,cell)
end
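A quick shape check, not part of the model and with arbitrary sizes: for input width nin and hidden width H, the weight matrix is (nin+H) x 4H, the bias is 1 x 4H, and both returned states are batch x H.
nin, H, B = 8, 16, 4                       # arbitrary input width, hidden width, batch size
w = xavier(nin+H, 4H)                      # gate weights, columns ordered forget/input/output/change
b = zeros(1, 4H)                           # gate biases
h = zeros(B, H); c = zeros(B, H)           # initial hidden and cell states
x = randn(B, nin)                          # one fake input step
h2, c2 = lstm(w, b, h, c, x)
println(size(h2), " ", size(c2))           # both (4, 16)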
#Task 2: Create Initial Weights
function initweights(lenhidden, lenvocab, embed, arraytype)
init(d...) = xavier(d...)
bias(d...) = zeros(d...)
tmp_model = Vector{Array{Float64}}(2*length(lenhidden)+3) #2*num layers + 3
X = embed #size of x input
for k = 1:length(lenhidden)
tmp_model[2k-1] = init(X+lenhidden[k],4*lenhidden[k]) #Each element of the Vector is a matrix itself
tmp_model[2k] = bias(1,4*lenhidden[k])
#Combine all the weights
#Biases separately
X = lenhidden[k] #Replace previous layer size
end
tmp_model[end-2] = init(lenvocab,embed) #Embedding matrix: vocab size by embedding size
tmp_model[end-1] = init(lenhidden[end],lenvocab) #Output weights: last hidden size by vocab size
tmp_model[end] = bias(1,lenvocab) #Output bias: one zero per vocabulary entry
if arraytype == "KnetArray"
model = Vector{KnetArray{Float64}}(2*length(lenhidden)+3)
for k = 1:length(tmp_model)
model[k] = KnetArray(tmp_model[k]);
println(typeof(model[k]))
end
else
model = copy(tmp_model);
end
return model
end
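The layout of the returned model vector confused me at first, so here is a small, purely illustrative configuration (two layers of 32 and 16, a vocabulary of 10, embedding size 8): the per-layer weight/bias pairs come first, then the embedding matrix, output weights, and output bias.
toy_model = initweights([32, 16], 10, 8, "Array")
for k = 1:length(toy_model)
    println(k, ": ", size(toy_model[k]))
end
# 1: (40, 128)  2: (1, 128)   layer 1 weights/bias (8+32 by 4*32)
# 3: (48, 64)   4: (1, 64)    layer 2 weights/bias (32+16 by 4*16)
# 5: (10, 8)                  embedding (vocab by embed)
# 6: (16, 10)   7: (1, 10)    output weights and bias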
#Task 3: Create Initial State
function initstate(model, batch)
#Check what type to use
if typeof(model[1]) == Array{Float64,2}
arraytype = "Array"
else
arraytype = "KnetArray"
end
nlayers = div(length(model)-3,2)
state = Vector{Any}(2*nlayers)
for k = 1:nlayers
#Get length of layer
H = div(size(model[2k],2),4)
state[2k-1] = state[2k] = zeros(batch,H)
#hidden and cell for each layer: batch x H matrices, initialized to zero
end
if arraytype == "KnetArray"
for k = 1:length(state)
state[k] = KnetArray(state[k]);
end
end
return state
end
#Task 4: Create Predict function
function predict(model, state, input; pdrop=0)
nlayers = div(length(model)-3,2)
newstate = similar(state)
for k = 1:nlayers
#Run through by selecting the right elements corresponding to that layer
#state[2k-1] -> hidden
#state[2k] -> cell
#model[2k-1] -> weights
#model[2k] -> bias
hidden, cell = lstm(model[2k-1],model[2k],state[2k-1],state[2k],input)
newstate[2k-1] = hidden
newstate[2k] = cell
input = hidden
end
return input,newstate
end
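One forward step through predict, again on made-up sizes, just to see that the output is the last layer's hidden state and that the state vector is refreshed:
m = initweights([32], 10, 8, "Array")       # one hidden layer of 32, vocab of 10, embedding of 8
s = initstate(m, 1)                         # batch of 1
x = m[end-2][[3],:]                         # embedding row of an arbitrary token index (3)
out, s = predict(m, s, x)
println(size(out))                          # (1, 32): hidden state of the (only) layer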
#Generate and Sample Functions
function generate(model, tok2int, nchar)
#tok2int is the dictionary
int2tok = Vector{Float64}(length(tok2int))
for (k,v) in tok2int; int2tok[v] = k; end #Make vector with characters in order of indexing based on dictionary
input = tok2int[0] #Start from the vocabulary index of the value 0
state = initstate(model, 1)
synth_signal = zeros(0)
for t in 1:nchar
embed = model[end-2][[input],:] #Entire row of input x based on character
ypred,state = predict(model,state,embed)
ypred = ypred * model[end-1] .+ model[end] #To get outputs
input = sample(exp.(logp(ypred))) #Convert to prob dist and sample
synth_signal = append!(synth_signal,int2tok[input])
end
return synth_signal
end
function sample(p)
p = convert(Array,p)
r = rand()
for c = 1:length(p)
r -= p[c]
r < 0 && return c
end
end
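sample draws an index from the categorical distribution given by p, so over many draws the empirical frequencies should roughly match p (the probabilities here are arbitrary):
p = [0.1 0.2 0.7]                                        # arbitrary probability row vector
draws = [sample(p) for i = 1:10000]
println([count(x -> x == c, draws)/10000 for c = 1:3])   # roughly [0.1, 0.2, 0.7]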
function minibatches(signal, tok2int, batch_size)
signal = collect(signal)
if length(signal) > batch_size
nbatch = div(length(signal), batch_size)
else
nbatch = 1
batch_size = length(signal)
end
data = [zeros(Int,batch_size) for i=1:nbatch ]
for n = 1:nbatch
for b = 1:batch_size
sig = signal[(b-1)*nbatch + n]
data[n][b] = tok2int[sig]
end
end
return data
end
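The batching is strided: row b of batch n holds sample (b-1)*nbatch + n, so reading row b across consecutive batches walks through a contiguous stretch of the signal. A tiny made-up example:
toy_signal = collect(0.0:11.0)                       # 12 made-up sample values
toy_vocab = createVocabulary(toy_signal)
toy_data = minibatches(toy_signal, toy_vocab, 4)     # batch size 4 -> nbatch = 3
for batch in toy_data
    println(batch)                                   # [1,4,7,10], [2,5,8,11], [3,6,9,12]
end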
#Task 5: Create Loss Function
function loss(model, state, sequence, range=1:length(sequence)-1; newstate=nothing, pdrop=0)
preds = []
for t in range
input = model[end-2][sequence[t],:] #Look up embedding rows for the batch of token indices at step t
pred,state = predict(model,state,input; pdrop=pdrop) #pdrop is the dropout probability
push!(preds,pred)
end
if newstate != nothing
copy!(newstate, map(AutoGrad.getval,state))
end
pred0 = vcat(preds...)
pred1 = dropout(pred0,pdrop)
pred2 = pred1 * model[end-1]
pred3 = pred2 .+ model[end]
logp1 = logp(pred3,2) #Convert to dist
nrows,ncols = size(pred3)
golds = vcat(sequence[range+1]...) #Actual answers - gold standard
index = similar(golds)
@inbounds for i=1:length(golds)
index[i] = i + (golds[i]-1)*nrows
#index[i] = golds[i] + nrows*(i-1)
end
logp2 = logp1[index]
logp3 = sum(logp2)
return -(logp3) / length(golds)
end
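The index arithmetic in the loss relies on column-major storage: linear index i + (g-1)*nrows picks out row i, column g, i.e. the log-probability of the gold token for step i. A small check with a throwaway matrix:
A = reshape(1:12, 3, 4)               # 3 rows (time steps), 4 columns (vocabulary)
golds = [2, 4, 1]                     # made-up gold token per row
idx = [i + (golds[i]-1)*3 for i = 1:3]
println([A[k] for k in idx])          # same as [A[1,2], A[2,4], A[3,1]] = [4, 11, 3]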
function avgloss(model, sequence, S)
T = length(sequence)
B = length(sequence[1])
state = initstate(model, B)
total = count = 0.0
for i in 1:S:T-1 #S is length of smaller sequences
j = min(i+S-1,T-1)
n = j-i+1
total += n * Float64(loss(model, state, sequence, i:j; newstate=state))
count += n
end
return total / count
end
#Task 6: Create Train Function
function train(model, sequence, optim, S, lossgradient; pdrop=0)
T = length(sequence)
B = length(sequence[1])
state = initstate(model, B)
for i in 1:S:T-1
j = min(i+S-1,T-1)
grads = lossgradient(model, state, sequence, i:j; newstate=state, pdrop=pdrop)
update!(model, grads, optim)
end
return model
end
function setup(numepochs,mode,arraytype)
#Options
if mode == "Normal"
datafiles = ["test01_00s.csv","test02_45s.csv","test03_90s.csv","test04_00s.csv","test05_45s.csv","test06_90s.csv","test07_00s.csv","test08_45s.csv","test09_90s.csv","test10_00w.csv","test11_45w.csv","test12_90w.csv","test13_00w.csv","test14_45w.csv","test15_90w.csv","test16_00w.csv","test17_45w.csv","test18_90w.csv","test19_00j.csv","test20_45j.csv","test21_90j.csv","test22_00j.csv","test23_45j.csv","test24_90j.csv","test25_00j.csv","test26_45j.csv","test27_90j.csv"]; # If provided, use first file for training, second for dev, others for test.
elseif mode == "Short"
datafiles = ["207.csv","203.csv","212.csv"]
else
datafiles = ["207.csv","203.csv","212.csv","209.csv","201.csv","202.csv","205.csv","208.csv","210.csv","213.csv"];
end
lenhidden = [128]; # Sizes of one or more LSTM layers.
options1 = Dict{String,Int64}()
options1["togenerate"] = Int64(4000) # If non-zero generate given number of characters.
options1["epochs"] = Int64(numepochs) # Number of epochs for training.
options1["embed"] = Int64(168) # Size of the embedding vector.
options1["batchsize"] = Int64(512) # Number of sequences to train on in parallel
options1["seqlength"] = Int64(20) # Maximum number of steps to unroll the network for bptt. Initial epochs will use the epoch number as bptt length for faster convergence.
options1["seed"] = -1 # Random number seed. -1 or 0 is no fixed seed
options2 = Dict{String,Float64}()
options2["lr"] = 1e-1 # Initial learning rate
options2["gclip"] = 3.0 # Value to clip the gradient norm at.
options2["dpout"] = 0.0 # Dropout probability.
#options1["seed"] > 0 && srand(options1["seed"])
# read the signal data and report total length
signal = [];
for i = 1:length(datafiles)
sig = CSV.read(datafiles[i], nullable = false);
signal = append!(signal, sig[:,3]);
end
!isempty(signal) && info("Values read: $(length(signal))")
vocab = createVocabulary(signal);
info("$(length(vocab)) unique values.") # The output should be 75 unique chars for input.txt
#Now, let's generate some random text
model = initweights(lenhidden, length(vocab), options1["embed"],arraytype);
firststate = initstate(model,1);
randsignal = generate(model, vocab, options1["togenerate"]) #Sample from the untrained model
return randsignal, options1, options2, datafiles, lenhidden, signal, vocab, model
end
function getbatches(options1,signal,vocab)
#Now we are ready. First, build the minibatches for training.
data = minibatches(signal, vocab, options1["batchsize"]);
return data
end
function setuptrain(options2, data, model)
# Knet magic
lossgradient = grad(loss,1);
# Print the loss of randomly initialized model.
losses = avgloss(model,data,100);
println((:epoch,0,:loss,losses))
#Below is the training part of RNN (with Adam)
optim = map(x->Adam(lr=options2["lr"], gclip=options2["gclip"]), model);
return lossgradient, optim
end
function trainloop(options1, options2, vocab, data, lossgradient, optim, model)
# MAIN LOOP
for epoch=1:options1["epochs"]
@time train(model, data, optim, min(epoch,options1["seqlength"]), lossgradient; pdrop=options2["dpout"])
# Calculate and print the losses after each epoch
losses = avgloss(model,data,100);
println((:epoch,epoch,:loss,losses))
end
#If the loss is decreasing, we can generate a signal with the trained model.
state = initstate(model,1);
synth_signal = generate(model, vocab, options1["togenerate"])
return synth_signal
end
I first ran it on just small snippets of normal ECG. Altogether there are under 10,000 data points for training, which is very little, but that's all the normal dataset had.
randsignal, options1, options2, datafiles, lenhidden, signal, vocab, model = setup(40,"Normal","KnetArray");
plot(signal)
Here's the random signal before the LSTM was trained.
plot(randsignal,title = "Random Signal")
data = getbatches(options1, signal, vocab);
size(data)
lossgradient, optim = setuptrain(options2, data, model);
synth_signal = trainloop(options1, options2, vocab, data, lossgradient, optim, model);
After 40 epochs of training, here's the final signal. Not bad!
plot(synth_signal,title="Synthetic Signal")
I did it again with a few arrhythmia subjects, training for only a few epochs.
randsignal, options1, options2, datafiles, lenhidden, signal, vocab, model = setup(5,"Short","KnetArray");
plot(randsignal,title = "Random Signal")
data = getbatches(options1, signal, vocab);
size(data)
lossgradient, optim = setuptrain(options2, data, model);
synth_signal = trainloop(options1, options2, vocab, data, lossgradient, optim, model);
Definitely better than before - with only 5 epochs!
plot(synth_signal,title="Synthetic Signal")
Finally, I did it with 10 subjects for 50 epochs - which took several hours to run, even with a GPU.
randsignal, options1, options2, datafiles, lenhidden, signal, vocab, model = setup(50,"Arrhyth","KnetArray");
plot(randsignal,title = "Random Signal")
data = getbatches(options1, signal, vocab);
size(data)
lossgradient, optim = setuptrain(options2, data, model);
synth_signal = trainloop(options1, options2, vocab, data, lossgradient, optim, model);
This isn't going to fool any cardiologist, but it might do for the untrained eye.
plot(synth_signal,title="Synthetic Signal")