### Non-linear least squares

# define non-linear function
f = function(x,theta) theta[1] + theta[2]*exp(-theta[3]*x)

# generate data
set.seed(42)
theta0 = c(1,1,2) # true parameter values to be estimated
n = 100
x = runif(n,0,5)
y = f(x, theta0) + rnorm(n, sd = 0.1)

## plot function and generated data
plot(x,y)
curve(f(x, theta0), from=0, to=5, lwd=2, col="#332288", add=TRUE)
legend("topright", legend="Truth", lwd=2, col='#332288', bty='n')

# calculate (analytical) gradient as a function of x and theta.
# Note: in linear models, the gradient is only a function of x!
dtheta1 = function(x,theta) rep(1, length(x))
dtheta2 = function(x,theta) exp(-theta[3]*x)
dtheta3 = function(x,theta) -theta[2]*x*exp(-theta[3]*x)
dtheta  = function(x,theta) cbind(dtheta1(x,theta), dtheta2(x,theta), dtheta3(x,theta))

# calculate numerical gradient
## compute a forward finite-difference gradient (cf. nlme::fdHess)
fdGrad <- function(pars, fun, ...,
                   .relStep = (.Machine$double.eps)^(1/2),
                   minAbsPar = 0) {
  npar <- length(pars)
  incr <- ifelse(abs(pars) <= minAbsPar, .relStep,
                 (abs(pars) - minAbsPar) * .relStep)
  ival <- do.call(fun, list(pars, ...))
  sapply(1:npar, function(i) {
    del <- rep(0, npar)
    del[i] <- incr[i]
    (do.call(fun, list(pars + del, ...)) - ival) / incr[i]
  })
}
ndtheta = function(x, theta) fdGrad(theta, function(pars) f(x, pars))

# solve for theta hat by Gauss-Newton, using the relative change in the
# sum of squares as the convergence criterion
solve_nlls = function(fun, x, y, theta, dtheta, tol = 1e-5){
  ss.theta = sum((y - fun(x,theta))^2)
  ss.theta.hat = ss.theta * 100 # initial value to ensure the loop runs at least once
  while(abs((ss.theta - ss.theta.hat)/ss.theta) > tol){
    ss.theta = sum((y - fun(x,theta))^2)
    J = dtheta(x, theta)
    # Gauss-Newton update: theta + (J'J)^-1 J' r, with ginv() for numerical stability
    theta.hat = theta + MASS::ginv(t(J)%*%J) %*% t(J) %*% (y - fun(x,theta))
    ss.theta.hat = sum((y - fun(x,theta.hat))^2)
    theta = theta.hat
  }
  return(drop(theta)) # drop() returns a plain vector rather than a one-column matrix
}

theta.hat  = solve_nlls(fun = f, x, y, theta = c(1,1,3), dtheta)  # analytical gradient
theta.hat2 = solve_nlls(fun = f, x, y, theta = c(1,1,3), ndtheta) # numerical gradient

# estimate of the error standard deviation
# (this is the MLE; divide the sum of squares by n - 3 for the unbiased version)
sigma.hat = sqrt(mean((y - f(x,theta.hat))^2))

# calculate the variance/covariance matrix of the theta estimates
J = dtheta(x, theta.hat)
Sigma.mat = solve(t(J)%*%J) * sigma.hat^2

# plot estimate and 95% CI
plot(x,y)
curve(f(x, theta0), from=0, to=5, lwd=2, col="#332288", add=TRUE)
x.seq = seq(min(x), max(x), length.out=100)
lines(x.seq, f(x.seq, theta.hat), lwd = 2, col='#882255')
legend("topright", legend=c("Truth","Estimate"), lwd=2,
       col=c('#332288','#882255'), bty='n')

# use the delta method to find an approximate 95% CI on the mean
grd = dtheta(x.seq, theta.hat)
est.se = sqrt(diag(grd %*% Sigma.mat %*% t(grd)))
lines(x.seq, f(x.seq, theta.hat) + 1.96*est.se, col = '#882255')
lines(x.seq, f(x.seq, theta.hat) - 1.96*est.se, col = '#882255')

# plot gradient (sensitivity matrix)
d.seq <- dtheta(x.seq, theta.hat)
plot(range(x.seq), range(d.seq), xlab="x", ylab="dtheta", type="n")
for(i in 1:ncol(d.seq)) lines(x.seq, d.seq[,i], col=i)
lines(x.seq, x.seq*0, lty=3)
legend("topright", c("theta1","theta2","theta3"), lty=1, col=1:3, bty='n')
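
# optional sanity checks: the analytical and finite-difference gradients should
# agree to roughly sqrt(machine epsilon) at the fitted values, and the fits from
# the two gradients should be nearly identical
max(abs(dtheta(x, theta.hat) - ndtheta(x, theta.hat)))
max(abs(theta.hat - theta.hat2))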
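
# approximate 95% Wald intervals for the parameters themselves, using the
# standard errors on the diagonal of Sigma.mat (the same normal approximation
# that underlies the delta-method band above); theta.se is introduced here
theta.se = sqrt(diag(Sigma.mat))
cbind(estimate = theta.hat,
      lower = theta.hat - 1.96*theta.se,
      upper = theta.hat + 1.96*theta.se)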
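
# for comparison, the same model can be fit with R's built-in nls(), which also
# uses Gauss-Newton by default. coef(fit) should closely match theta.hat, while
# vcov(fit) will differ slightly from Sigma.mat because nls() divides the residual
# sum of squares by n - 3 rather than n. (the formula variables t1, t2, t3 and
# the name 'fit' are introduced here just for this check.)
fit = nls(y ~ t1 + t2*exp(-t3*x), start = list(t1 = 1, t2 = 1, t3 = 3))
coef(fit)
vcov(fit)
summary(fit)$sigma # compare with sigma.hat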