Assignment Overview: In this assignment, we will complete the function unconstrained_newton, which implements four Hessian options (exact, SR1 approximation, BFGS approximation, steepest descent) and three globalization strategies (none, line search, trust region). We will then compare 10 algorithm variations on three test problems.
The best way to really learn the algorithms discussed in lecture is to implement them yourself. Writing a function like unconstrained_newton from scratch takes a lot of time and comfort with Python, so instead we will start with a fairly complete version of unconstrained_newton. You will need to fill in the following details:
This assignment could take a long time, especially if you are still learning Python. Recognizing this, we will try a new grading policy.
Instructions: To be eligible for full credit on the assignment, you need to:
After reading through the entire assignment, please prepare pseudocode for the following functions/algorithms:
Please turn in this pseudocode via Gradescope.
Reminder: pseudocode should not look like Python code copied onto paper. Instead, it should clearly communicate the main steps and control flow of the algorithm. Add lots of comments. Pseudocode should not be specific to any programming language.
Below is a library of helpful functions. Most of these came from in-class examples.
# Load required Python libraries.
import matplotlib.pyplot as plt
import numpy as np
from scipy import linalg
## Check if any element of array is NaN
def check_nan(A):
return np.sum(np.isnan(A))
## Calculate gradient with central finite difference
def my_grad_approx(x,f,eps1,verbose=False):
'''
    Calculate gradient of function f using central difference formula
Inputs:
x - point for which to evaluate gradient
f - function to consider
eps1 - perturbation size
Outputs:
grad - gradient (vector)
'''
n = len(x)
grad = np.zeros(n)
if(verbose):
print("***** my_grad_approx at x = ",x,"*****")
for i in range(0,n):
# Create vector of zeros except eps in position i
e = np.zeros(n)
e[i] = eps1
# Finite difference formula
my_f_plus = f(x + e)
my_f_minus = f(x - e)
# Diagnostics
if(verbose):
print("e[",i,"] = ",e)
print("f(x + e[",i,"]) = ",my_f_plus)
print("f(x - e[",i,"]) = ",my_f_minus)
grad[i] = (my_f_plus - my_f_minus)/(2*eps1)
if(verbose):
print("***** Done. ***** \n")
return grad
## Calculate Hessian using central finite differences of the gradient
def my_hes_approx(x,grad,eps2):
'''
    Calculate Hessian of function f using central difference formula applied to the gradient
    Inputs:
    x - point at which to evaluate the Hessian
    grad - function to calculate the gradient
eps2 - perturbation size (for Hessian NOT gradient approximation)
Outputs:
H - Hessian (matrix)
'''
n = len(x)
H = np.zeros([n,n])
for i in range(0,n):
# Create vector of zeros except eps in position i
e = np.zeros(n)
e[i] = eps2
# Evaluate gradient twice
grad_plus = grad(x + e)
grad_minus = grad(x - e)
# Notice we are building the Hessian by column (or row)
H[:,i] = (grad_plus - grad_minus)/(2*eps2)
return H
## Linear algebra calculation
def xxT(u):
'''
    Calculates u*u.T to circumvent a limitation with SciPy
Arguments:
u - numpy 1D array
Returns:
u*u.T
    Assume u is an nx1 vector.
    Recall: NumPy does not distinguish between row and column vectors.
    u.dot(u) returns a scalar. This function returns an nxn matrix.
'''
n = len(u)
A = np.zeros([n,n])
for i in range(0,n):
for j in range(0,n):
A[i,j] = u[i]*u[j]
return A
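# Optional check (my own example, not part of the assignment): xxT(u) is
# equivalent to np.outer(u, u).
u_demo = np.array([1.0, 2.0, 3.0])
print(np.allclose(xxT(u_demo), np.outer(u_demo, u_demo)))  # expect True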
## Analyze Hessian
def analyze_hes(B):
print(B,"\n")
l = linalg.eigvals(B)
print("Eigenvalues: ",l,"\n")
Below is the main function. You will need to complete the details in four supporting functions that are defined after it.
def unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose=False,max_iter=50,
algorithm="newton",globalization="line-search", # specify algorithm
eps_dx=1E-6,eps_df=1E-6, # Convergence tolerances (all)
eta_ls=0.25,rho_ls=0.9,alpha_max=1.0, # line search parameters
delta_max_tr=10.0,delta_0_tr=2.0, # trust region parameters
kappa_1_tr = 0.25, kappa_2_tr = 0.75, gamma_tr=0.125 # trust region parameters
):
'''
Newton-Type Methods for Unconstrained Nonlinear Continuous Optimization
Arguments (required):
calc_f : function f(x) to minimize [scalar]
calc_grad: gradient of f(x) [vector]
calc_hes: Hessian of f(x) [matrix]
Arguments (options):
algorithm : specify main algorithm.
choices: "newton", "sr1", "bfgs", "steepest-descent"
globalization : specify globalization strategy
choices: "none", "line-search", "trust-region"
eps_dx : tolerance for step size norm (convergence), eps1 in notes
eps_df : tolerance for gradient norm (convergence), eps2 in notes
eta_ls : parameter for Goldstein-Armijo conditions (line search only)
rho_ls : parameter to shrink (backstep) line search
alpha_max : initial step length scaling for line search and/or steepest-descent
delta_max_tr : maximum trust region size
delta_0_tr : initial trust region size
kappa_1_tr : 'shrink' tolerance for trust region
kappa_2_tr : 'expand' tolerance for trust region
gamma_tr : 'accept step' tolerance for trust region
Returns:
x : iteration history for x (decision variables) [list of numpy arrays]
    f : iteration history for f(x) (objective values) [list of floats]
p : iteration history for p (steps)
B : iteration history for Hessian approximations [list of numpy arrays]
'''
# Allocate outputs as empty lists
x = [] # decision variables
f = [] # objective values
p = [] # steps
grad = [] # gradients
alpha = [] # line search / steepest descent step scalar
B = [] # Hessian approximation
delta = [] # trust region size
rho = [] # trust region actual/prediction ratio
step_accepted = [] # step status for trust region. True means accepted. False means rejected.
# Note: alpha, delta and rho will remain empty lists unless used in the algorithm below
# Store starting point
x.append(x0)
k = 0
flag = True
# Print header for iteration information
print("Iter. \tf(x) \t\t||grad(x)|| \t||p|| \t\tmin(eig(B)) \talpha \t\tdelta \t\tstep")
while flag and k < max_iter:
# Evaluate f(x) at current iteration
f.append(calc_f(x[k]))
# Evaluate gradient
grad.append(calc_grad(x[k]))
if(check_nan(grad[k])):
print("WARNING: gradiant calculation returned NaN")
break
if verbose:
print("\n")
print("k = ",k)
print("x = ",x[k])
print("grad = ",grad[k])
print("f = ",f[k])
# Calculate exact Hessian or update approximation
if(algorithm == "newton"):
B.append(calc_hes(x[k]))
elif k == 0 or algorithm == "steepest-descent":
# Initialize or set to identity
B.append(np.eye(len(x0)))
elif algorithm == "sr1" or algorithm == "bfgs":
# Change in x
s = x[k] - x[k-1]
# Change in gradient
y = grad[k] - grad[k-1]
if verbose:
print("s = ",s)
print("y = ",y)
if algorithm == "sr1": # Calculate SR1 update
dB = sr1_update(s, y, k, B, verbose)
B.append(B[k-1] + dB)
else: # Calculate BFGS update
dB = bfgs_update(s, y, k, B, verbose)
B.append(B[k-1] + dB)
else:
print("WARNING. algorithm = ",algorithm," is not supported.")
break
if verbose:
print("B = \n",B[k])
print("B[k].shape = ",B[k].shape)
if(check_nan(B[k])):
print("WARNING: Hessian update returned NaN")
break
c = np.linalg.cond(B[k])
if c > 1E12:
flag = False
print("Warning: Hessian approximation is near singular.")
print("B[k] = \n",B[k])
else:
# Solve linear system to calculate step
pk = linalg.solve(B[k],-grad[k])
if globalization == "none":
if algorithm == "steepest-descent":
# Save step and scale by alpha_max
p.append(pk*alpha_max)
else:
# Take full step
p.append(pk)
# Apply step and calculate x[k+1]
x.append(x[k] + p[k])
elif globalization == "line-search":
# Line Search Function
update, alphak = line_search(x, f, grad, calc_f, pk, k, alpha_max, eta_ls, rho_ls, verbose)
# Now the line search is complete, apply final value of alphak
p.append(update)
# Save alpha
alpha.append(alphak)
# Apply step and calculate x[k+1]
x.append(x[k] + p[k])
elif globalization == "trust-region":
# Trust region function
update = trust_region(x, grad, B, delta, k, pk, delta_0_tr, verbose)
p.append(update)
### Trust region management
# Actual reduction
ared = f[k] - calc_f(x[k] + p[k])
# Predicted reduction
pred = -(grad[k].dot(p[k]) + 0.5*p[k].dot(B[k].dot(p[k])))
# Calculate rho
if ared == 0 and pred == 0:
                    # This occurs if the gradient is zero and the Hessian is P.D.
rho.append(1E4)
else:
rho.append(ared/pred)
if(verbose):
print("f[k] = ",f[k])
print("p[k] = ",p[k])
print("f(x[k] + p[k]) = ",calc_f(x[k] + p[k]))
print("ared = ",ared)
print("pred = ",pred)
print("rho = ",rho[k])
## Check trust region shrink/expand logic
if rho[k] < kappa_1_tr:
# Shrink trust region
delta.append(kappa_1_tr*linalg.norm(p[k]))
elif rho[k] > kappa_2_tr and np.abs(linalg.norm(p[k]) - delta[k]) < 1E-6:
# Expand trust region
delta.append(np.min([2*delta[k], delta_max_tr]))
else:
# Keep trust region same size
delta.append(delta[k])
# Compute step
if rho[k] > gamma_tr:
# Take step
x.append(x[k] + p[k])
step_accepted.append(True)
else:
# Skip step
x.append(x[k])
step_accepted.append(False)
else:
print("Warning: globalization strategy not supported")
if verbose:
print("p = ",p[k])
# Calculate norms
norm_p = linalg.norm(p[k])
norm_g = linalg.norm(grad[k])
# Calculate eigenvalues of Hessian (for display only)
min_ev = np.min(np.real(linalg.eigvals(B[k])))
'''
print("f[k] = ",f[k])
print("norm_g = ",norm_g)
print("norm_g = ",norm_p)
print("min_ev = ",min_ev)
'''
print(k,' \t{0: 1.4e} \t{1:1.4e} \t{2:1.4e} \t{3: 1.4e}'.format(f[k],norm_g,norm_p,min_ev),end='')
# Python tip. The expression 'if my_list' is false if 'my_list' is empty
if not alpha:
# alpha is an empty list
print(' \t -------',end='')
else:
# otherwise print value of alpha
print(' \t{0: 1.2e}'.format(alpha[k]),end='')
if not delta:
# delta is an empty list
print(' \t -------',end='')
else:
            # otherwise print value of delta
print(' \t{0: 1.2e}'.format(delta[k]),end='')
if not step_accepted:
print(' \t -----')
else:
if step_accepted[k]:
print(' \t accept')
else:
print(' \t reject')
# Check convergence criteria.
flag = (norm_p > eps_dx) and (norm_g > eps_df)
# Update iteration counter
k = k + 1
if(k == max_iter and flag):
print("Reached maximum number of iterations.")
print("x* = ",x[-1])
return x,f,p,B
def sr1_update(s, y, k, B, verbose):
"""
    Function that computes the SR1 update to the Hessian approximation
Inputs:
s : Change in x
y : Change in gradient
k : Iteration number
B : Hessian approximation
verbose : toggles verbose output (True or False)
Outputs:
dB : Update to Hessian approximation
"""
# SR1 formulation
u = y - B[k-1].dot(s)
denom = (u).dot(s)
# Formula: dB = u * u.T / (u.T * s) if u is a column vector.
if abs(denom) <= 1E-10:
# Skip update
dB = 0
else:
# Calculate update
dB = xxT(u)/denom
if(verbose):
print("SR1 update denominator, (y-B[k-1]*s).T*s = ",denom)
print("SR1 update u = ",u)
print("SR1 update u.T*u/(u.T*s) = \n",dB)
return dB #return the update to the Hessian approximation
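# Optional sanity check for sr1_update (made-up data, not part of the assignment):
# whenever the update is not skipped, the secant condition (B[k-1] + dB).dot(s) == y should hold.
B_hist = [np.eye(2)]                          # stand-in Hessian approximation history
s_chk = np.array([1.0, -0.5])                 # hypothetical step
H_true = np.array([[4.0, 1.0], [1.0, 3.0]])   # Hessian of a hypothetical quadratic
y_chk = H_true.dot(s_chk)                     # corresponding change in gradient
dB_chk = sr1_update(s_chk, y_chk, 1, B_hist, False)
print((B_hist[0] + dB_chk).dot(s_chk), "should equal", y_chk)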
def bfgs_update(s, y, k, B, verbose):
"""
    Function that computes the BFGS update to the Hessian approximation
Inputs:
s : Change in x
y : Change in gradient
k : Iteration number
B : Hessian approximation
verbose : toggles verbose output (True or False)
Outputs:
dB : Update to Hessian approximation
"""
# Define constant used to check norm of the update
# See Eq. (3.19) in Biegler (2010)
C2 = 1E-4
# YOUR SOLUTION HERE
return dB #return the update to the Hessian approximation
def line_search(x, f, grad, calc_f, pk, k, alpha_max, eta_ls, rho_ls, verbose):
"""
Function that implements the line search globalization strategy
Inputs:
x : decision variables
f : objective values
grad : gradients
calc_f : function f(x) to minimize [scalar]
pk : step
    k : Iteration number
alpha_max : initial step length scaling for line search and/or steepest-descent
eta_ls : parameter for Goldstein-Armijo conditions (line search only)
rho_ls : parameter to shrink (backstep) line search
verbose : toggles verbose output (True or False)
Outputs:
update : update to p
alphak : update to alpha
"""
# Flag - continue line search?
ls = True
## Initialize alpha
alphak = alpha_max
if(verbose):
print("\t LINE SEARCH")
while ls:
# Calculate test point (if step accepted)
xtest = x[k] + alphak*pk
# Evaluate f(x) at test point. This is used for Armijo and Goldstein conditions
ftest = calc_f(xtest)
# YOUR SOLUTION HERE
# Now the line search is complete, apply final value of alphak
update = alphak*pk
return update, alphak
def trust_region(x, grad, B, delta, k, pk, delta_0_tr, verbose):
"""
Function that implements the trust region globalization strategy
Inputs:
x : decision variables
grad : gradients
B : Hessian approximation
delta : trust region size
k : Iteration number
pk : step
delta_0_tr : initial trust region size
verbose : toggles verbose output (True or False)
Outputs:
update : update to p
"""
### Initialize trust region radius
if(k == 0):
delta.append(delta_0_tr)
grad_zero = (linalg.norm(grad[k]) < 1E-14)
### Powell dogleg step
# Calculate Cauchy step (pC)
denom = grad[k].dot(B[k].dot(grad[k]))
if verbose:
print("TR: Cauchy step. grad.T*B*grad = ",denom)
if denom > 0:
# Term in ( ) is a scalar
pC = -(grad[k].dot(grad[k])/denom)*grad[k]
elif grad_zero:
pC = np.zeros(len(x[k]))
else:
pC = - delta[k]/linalg.norm(grad[k])*grad[k]
    # Use Newton step (calculated above)
pN = pk
# Determine step
if linalg.norm(pN) <= delta[k]:
# Take Newton step. pN is inside the trust region.
update = pN
# YOUR SOLUTION HERE
return update
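One way to start testing: the Newton algorithm with globalization="none" does not depend on any of the pieces you need to fill in, so a quick smoke test on a convex quadratic should already work. The snippet below is an optional sketch; the test function and starting point are my own choices.
# Optional smoke test: Newton with the exact Hessian and no globalization
# should reach the minimizer of a convex quadratic in essentially one step.
f_test = lambda x: (x[0] - 1)**2 + 4*(x[1] + 2)**2
g_test = lambda x: np.array([2*(x[0] - 1), 8*(x[1] + 2)])
h_test = lambda x: np.array([[2.0, 0.0], [0.0, 8.0]])
xs, fs, ps, Bs = unconstrained_newton(f_test, g_test, h_test, np.array([5.0, 5.0]),
                                      algorithm="newton", globalization="none")
# xs[-1] should be approximately [1, -2]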
For each feature, please indicate its status upon submission.
Status: please choose from "implemented and tested", "implemented but testing incomplete", "implementation incomplete", "did not attempt implementation"
Details: Please describe in a few sentences or bullet points the outstanding tasks.
Status: please choose from "implemented and tested", "implemented but testing incomplete", "implementation incomplete", "did not attempt implementation"
Details: Please describe in a few sentences or bullet points the outstanding tasks.
Status: please choose from "implemented and tested", "implemented but testing incomplete", "implementation incomplete", "did not attempt implementation"
Details: Please describe in a few sentences or bullet points the outstanding tasks.
Status: please choose from "implemented and tested", "implemented but testing incomplete", "implementation incomplete", "did not attempt implementation"
Details: Please describe in a few sentences or bullet points the outstanding tasks.
In the remainder of the assignment, we will compare several variants of Newton-type methods on test problems taken from in-class examples. For each problem and starting point, we will consider ten algorithm cases:
We will then assess each candidate solution by inspecting the eigenvalues of the true Hessian: all positive eigenvalues indicate a local minimizer, all negative a local maximizer, and mixed signs a saddle point. The norm of the gradient is already displayed in the iteration output.
All of this analysis is automated in the functions below.
def check_sln(x,calc_hes):
Hval = calc_hes(x)
l, v = linalg.eig(Hval)
print("Eigenvalues of Hessian at x* = ",l)
def benchmark(x0,calc_f,calc_grad,calc_hes,verbose,cases=range(0,10)):
'''
Test 10 algorithm cases for a single starting point
Arguments:
x0 - starting point
calc_f - function to evaluate objective
calc_grad - function to evaluate gradient
calc_hes - function to evaluate Hessian
verbose - toggles verbose output (True or False)
cases - list of cases to consider. Especially helpful for debugging.
        For example, setting cases=[1, 2] runs only cases 2 and 3 (recall, Python starts counting at 0)
'''
for i in cases:
if i == 0:
print("***Case 1: Newton method with exact Hessian and no globalization strategy***")
x1,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="newton",globalization="none")
check_sln(x1[-1],calc_hes)
print("\n")
if i == 1:
print("***Case 2: Newton method with exact Hessian and line search***")
x2,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="newton",globalization="line-search")
check_sln(x2[-1],calc_hes)
print("\n")
if i == 2:
print("***Case 3: Newton method with exact Hessian and trust region***")
x3,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="newton",globalization="trust-region")
check_sln(x3[-1],calc_hes)
print("\n")
if i == 3:
print("***Case 4: Quasi-Newton method with SR1 Hessian approximation and no globalization strategy***")
x4,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="sr1",globalization="none")
check_sln(x4[-1],calc_hes)
print("\n")
if i == 4:
print("***Case 5: Quasi-Newton method with SR1 Hessian approximation and line search***")
x5,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="sr1",globalization="line-search")
check_sln(x5[-1],calc_hes)
print("\n")
if i == 5:
print("***Case 6: Quasi-Newton method with SR1 Hessian approximation and trust region***")
x6,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="sr1",globalization="trust-region")
check_sln(x6[-1],calc_hes)
print("\n")
if i == 6:
print("***Case 7: Quasi-Newton method with BFGS Hessian approximation and no globalization strategy***")
x7,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="bfgs",globalization="none")
check_sln(x7[-1],calc_hes)
print("\n")
if i == 7:
print("***Case 8: Quasi-Newton method with BFGS Hessian approximation and line search***")
x8,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="bfgs",globalization="line-search")
check_sln(x8[-1],calc_hes)
print("\n")
if i == 8:
print("***Case 9: Quasi-Newton method with BFGS Hessian approximation and trust region***")
x9,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="bfgs",globalization="trust-region")
check_sln(x9[-1],calc_hes)
print("\n")
if i == 9:
print("***Case 10: Steepest Descent and line search***")
x10,f,p,B = unconstrained_newton(calc_f,calc_grad,calc_hes,x0,verbose,
algorithm="steepest-descent",globalization="line-search")
check_sln(x10[-1],calc_hes)
print("\n")
Start debugging your function with the following problem:
$$\min_x ~~ x_1^2 + (x_2 - 1)^2$$
def my_f(x):
return x[0]**2 + (x[1]-1)**2
def my_grad(x):
return np.array([2*x[0], 2*(x[1] - 1) ])
def my_hes(x):
return 2*np.eye(2)
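As an optional consistency check (my own suggestion, not a graded step), the analytic gradient and Hessian above should agree with the finite difference helpers at any point:
x_chk = np.array([-3.0, 2.0])
print("analytic gradient:", my_grad(x_chk))
print("FD gradient      :", my_grad_approx(x_chk, my_f, 1E-6))
print("analytic Hessian :\n", my_hes(x_chk))
print("FD Hessian       :\n", my_hes_approx(x_chk, my_grad, 1E-6))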
The code below runs cases 1 through 10. You can run only a subset using the cases keyword.
x0 = np.array([-3.0,2.0])
benchmark(x0,my_f,my_grad,my_hes,False)
Classify the solutions.
Answer:
Should we expect the SR1, BFGS, and steepest descent (all with line search) to always have the same results for the first iteration? Explain in a few sentences.
Answer:
For this problem, should we always expect all of the cases to converge to the same solution (regardless of starting point)? Explain in a few sentences.
Answer:
Consider a scalar function $f(x): \mathbb{R} \rightarrow \mathbb{R}$. Recall
$$f(x) = 0.5 (x-1)^4 + (x+1)^3 - 10 x^2 + 5 x$$
$$f'(x) = 6 - 8 x - 3 x^2 + 2 x^3$$
$$f''(x) = -8 - 6 x + 6 x^2$$
## Define f(x)
f_ = lambda x : 0.5*(x[0]-1)**4 + (x[0]+1)**3 - 10*x[0]**2 + 5*x[0]
## Define f'(x)
df_ = lambda x : (6 - 8*x[0] - 3*x[0]**2 + 2*x[0]**3)*np.ones(1)
## Define f''(x)
ddf_ = lambda x : (-8 - 6*x[0] + 6*x[0]**2)*np.ones((1,1))
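# Optional visualization (plotting range is my own choice): a quick plot of f(x)
# makes it easier to classify the stationary points found by the runs below.
xplot = np.linspace(-4.0, 4.0, 400)
plt.plot(xplot, [f_([xi]) for xi in xplot])
plt.xlabel("x")
plt.ylabel("f(x)")
plt.grid(True)
plt.show()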
x0 = np.ones(1)*(-3.0)
benchmark(x0,f_,df_,ddf_,False)
x0 = np.zeros(1)
benchmark(x0,f_,df_,ddf_,False)
Which cases converged to a local maximizer?
Answer:
Which cases converged to a local minimizer?
Answer:
Based on these results, which algorithms do you recommend for non-convex problems?
Answer:
Now we will benchmark the algorithms using Example 2.19.
Consider the function $g(z) = \sqrt{z}$, which is only defined for $z \geq 0$. We will shortly learn how to handle bounds/inequality constraints in an optimization problem.
But for this assignment, we will focus on algorithms for unconstrained nonlinear optimization. We will use a smoothed approximation to ensure $g(z)$ is defined and twice-differentiable over $z \in \mathbb{R}$. Consider:
$$g(z) \approx \sqrt{\max(z,0)}$$
This ensures that $g(z)$ is defined over $z \in \mathbb{R}$. But what about twice differentiable? We will further approximate
$$\max(z,0) \approx \frac{1}{2}\left(\sqrt{z^2 + 10^{-4}} + z \right)$$
Below is an implementation of Example 2.19 that uses this smoothed approximation.
def asqrt(z):
'''
Approximate/smoothed square root
'''
return np.sqrt(0.5*(np.sqrt(z**2 + 1E-4) + z))
def ex2_19_smoothed(x,verbose=False):
''' Evaluate function given above at point x
Inputs:
x - vector with 2 elements
Outputs:
f - function value (scalar)
'''
# Constants
a = np.array([0.3, 0.6, 0.2])
b = np.array([5, 26, 3])
c = np.array([40, 1, 10])
    # Intermediates. Recall Python indices start at 0
u = x[0] - 0.8
s = asqrt(1-u)
s2 = asqrt(1+u)
v = x[1] -(a[0] + a[1]*u**2*s-a[2]*u)
alpha = -b[0] + b[1]*u**2*s2+ b[2]*u
beta = c[0]*v**2*(1-c[1]*v)/(1+c[2]*u**2)
f = alpha*np.exp(-beta)
if verbose:
print("##### my_f at x = ",x, "#####")
print("u = ",u)
print("sqrt(1-u) = ",s)
print("sqrt(1+u) = ",s2)
print("v = ",v)
print("alpha = ",alpha)
print("beta = ",beta)
print("f(x) = ",f)
print("##### Done. #####\n")
return f
my_f2 = lambda x : ex2_19_smoothed(x,verbose=False)
my_grad2 = lambda x : my_grad_approx(x,my_f2,1E-6,verbose=False)
my_hes2 = lambda x : my_hes_approx(x,my_grad2,1E-6)
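# Optional visualization (grid ranges are my own choice): a contour plot of the
# smoothed objective helps interpret the benchmark results below.
xg = np.linspace(0.0, 1.6, 100)
yg = np.linspace(-0.2, 1.0, 100)
X, Y = np.meshgrid(xg, yg)
Z = np.zeros_like(X)
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        Z[i, j] = my_f2(np.array([X[i, j], Y[i, j]]))
plt.contour(X, Y, Z, 50)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()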
x0 = np.array([0.7, 0.3])
benchmark(x0,my_f2,my_grad2,my_hes2,False)
Which cases do not converge to the known solution?
Answer:
What is happening with the SR1 and BFGS update when no globalization strategy is used? Hint: This is related to the square root approximation.
Answer:
x0 = np.array([0.0, 0.0])
benchmark(x0,my_f2,my_grad2,my_hes2,False)
Did any of the 10 cases converge to the known solution?
Answer:
What do you recommend trying next to more reliably solve this particular problem?
Answer: