Inertia-Corrected Netwon Method for Equality Constrained NLPs

7.8. Inertia-Corrected Netwon Method for Equality Constrained NLPs#

Background:

Eqn5-12

The Algorithm:

Alg5-2a

Alg5-2b

7.8.1. Helper Functions#

# Load required Python libraries.
import matplotlib.pyplot as plt
import numpy as np
from scipy import linalg

## Check is element of array is NaN
def check_nan(A):
    return np.sum(np.isnan(A))

## Calculate gradient with central finite difference
## Calculate gradient with central finite difference
def my_grad_approx(x,f,eps1,verbose=False):
    '''
    Calculate gradient of function f using central difference formula
    
    Inputs:
        x - point for which to evaluate gradient
        f - function to consider
        eps1 - perturbation size
        
    Outputs:
        grad - gradient (vector)
    '''
    
    n = len(x)
    grad = np.zeros(n)
    
    if(verbose):
        print("***** my_grad_approx at x = ",x,"*****")
    
    for i in range(0,n):
        
        # Create vector of zeros except eps in position i
        e = np.zeros(n)
        e[i] = eps1
        
        # Finite difference formula
        my_f_plus = f(x + e)
        my_f_minus = f(x - e)
        
        # Diagnostics
        if(verbose):
            print("e[",i,"] = ",e)
            print("f(x + e[",i,"]) = ",my_f_plus)
            print("f(x - e[",i,"]) = ",my_f_minus)
        
        
        grad[i] = (my_f_plus - my_f_minus)/(2*eps1)
    
    if(verbose):
        print("***** Done. ***** \n")
    
    return grad

def my_jac_approx(x,h,eps1,verbose=False):
    '''
    Calculate Jacobian of function h(x) using central difference formula
    
    Inputs:
        x - point for which to evaluate gradient
        h - vector-valued function to consider. h(x): R^n --> R^m
        eps1 - perturbation size
        
    Outputs:
        A - Jacobian (n x m matrix)
    '''
    
    # Check h(x) at x
    h_x0 = h(x)
    
    # Extract dimensions
    n = len(x)
    m = len(h_x0)
    
    # Initialize Jacobian matrix
    A = np.zeros((n,m))
    
    # Calculate Jacobian by row
    for i in range(0,n):
        
        # Create vector of zeros except eps in position i
        e = np.zeros(n)
        e[i] = eps1
        
        # Finite difference formula
        my_h_plus = h(x + e)
        my_h_minus = h(x - e)
        
        # Diagnostics
        if(verbose):
            print("e[",i,"] = ",e)
            print("h(x + e[",i,"]) = ",my_h_plus)
            print("h(x - e[",i,"]) = ",my_h_minus)
        
        
        A[i,:] = (my_h_plus - my_h_minus)/(2*eps1)
    
    if(verbose):
        print("***** Done. ***** \n")
    
    return A
    
## Calculate gradient using central finite difference and my_hes_approx
def my_hes_approx(x,grad,eps2):
    '''
    Calculate gradient of function my_f using central difference formula and my_grad
    
    Inputs:
        x - point for which to evaluate gradient
        grad - function to calculate the gradient
        eps2 - perturbation size (for Hessian NOT gradient approximation)
        
    Outputs:
        H - Hessian (matrix)
    '''
    
    n = len(x)
    H = np.zeros([n,n])
    
    for i in range(0,n):
        # Create vector of zeros except eps in position i
        e = np.zeros(n)
        e[i] = eps2
        
        # Evaluate gradient twice
        grad_plus = grad(x + e)
        grad_minus = grad(x - e)
        
        # Notice we are building the Hessian by column (or row)
        H[:,i] = (grad_plus - grad_minus)/(2*eps2)

    return H

## Linear algebra calculation
def xxT(u):
    '''
    Calculates u*u.T to circumvent limitation with SciPy
    
    Arguments:
    u - numpy 1D array
    
    Returns:
    u*u.T
    
    Assume u is a nx1 vector.
    Recall: NumPy does not distinguish between row or column vectors
    
    u.dot(u) returns a scalar. This functon returns an nxn matrix.
    '''
    
    n = len(u)
    A = np.zeros([n,n])
    for i in range(0,n):
        for j in range(0,n):
            A[i,j] = u[i]*u[j]
    
    return A

## Analyze Hessian
def analyze_hes(B):
    print(B,"\n")
    
    l = linalg.eigvals(B)
    print("Eigenvalues: ",l,"\n")

7.8.2. Algorithm 5.2#

def assemble_check_KKT(W,A,deltaA,deltaW,verbose):
    
    n = np.size(W,0)
    
    if(np.size(W,1) != n):
        print("WARNING: W is not square. Somthing is broken.")
        
    rA = np.size(A,0)
    m = np.size(A,1)
    
    if(rA != n):
        print("WARNING: A does not have the corrent number of rows.")
    
    # Assemble KKT matrix
    KKT_top = np.concatenate((W + deltaW*np.eye(n),A),axis=1)
    KKT_bot = np.concatenate((A.T,-deltaA*np.eye(m)),axis=1)
    KKT = np.concatenate((KKT_top,KKT_bot),axis=0)
        
    # Check inertia of KKT matrix.
    # Out of simplicity, we will just calculate the eigenvalues.
    # Biegler, 2010 explains a more sophisticates (and computationally efficient)
    # strategy
    l, eigvec = linalg.eig(KKT)

    zero_tol = 1E-12

    # Count number of positive eigenvalues
    pos_ev = sum(l >= zero_tol)

    # Count number of eigenvalues close to zero
    zero_ev = sum(np.abs(l) < zero_tol)

    # Count number of negative eigenvalues
    neg_ev = sum(l <= -zero_tol)

    inertia_correct = (n == pos_ev and m == neg_ev)
    
    if(verbose):
        print("deltaA = ",deltaA,"  deltaW = ",deltaW,"intertia =(",pos_ev,",",neg_ev,",",zero_ev,")\n")
    
    return KKT,inertia_correct,pos_ev,neg_ev,zero_ev
    
def alg52(x0,calc_f,calc_h,eps1=1E-6,eps2=1E-6,max_iter=10,verbose=False):
    '''
    Basic Full Space Newton Method for Equality Constrained NLP
    
    Input:
        x0 - starting point (vector)
        calc_f - function to calculate objective (returns scalar)
        calc_h - function to calculate constraints (returns vector)
        eps1 - tolerance for primal and dual steps
        eps2 - tolerance for gradient of L1
        
    Outputs:
        x - history of steps (primal variables)
        v - history of steps (dual variables)
        f - history of objective evaluations
        h - history of constraint evaluations
        df - history of objective gradients
        dL - history of Lagrange function gradients
        A - history of constraint Jacobians
        W - history of Lagrange Hessians
    
    Notes:
        1. For simplicity, central finite difference is used 
           for all gradient calculations.
    '''

    ### Specifics for Algorithm 5.2
    # Tuning parameters
    delta_bar_W_min = 1E-20
    delta_bar_W_0 = 1E-4
    delta_bar_W_max = 1E40
    delta_bar_A = 1E-8
    kappa_u = 8
    kappa_l = 1/3
    
    # Declare iteration histories as empty lists
    x = []
    v = []
    f = []
    L = []
    h = []
    df = []
    dL = []
    A = []
    W = []

    # Set for first iteration
    delta_A = 0.0
    delta_W = 0.0
    delta_W_last = 0.0
    
    # Flag for iterations
    flag = True
    
    # Iteration counter
    k = 0
    
    # Copy initial point to primal variable history
    n = len(x0)
    x.append(x0)
    
    # Evaluate objective and constraints at initial point
    f.append(calc_f(x0))
    h.append(calc_h(x0))
    
    # Determine number of equality constraints
    m = len(h[0])
    
    # Initial dual variables with vector of ones
    v.append(np.ones(m))
    
    # Print header for iteration information
    print("Iter. \tf(x) \t\t||h(x)|| \t||grad_L(x)|| \t||dx|| \t\t||dv|| \t\tdelta_A \tdelta_W")
    
    while(flag and k < max_iter):
        
        # STEP 1. Construct KKT matrix
        
        if(k > 0):
            # Evaluate objective function
            f.append(calc_f(x[k]))
            
            # Evaluate constraint function
            h.append(calc_h(x[k]))
        
        # Evaluate objective gradient
        df.append(my_grad_approx(x[k],calc_f,1E-6))
        
        # Evaluate constraint Jacobian
        A.append(my_jac_approx(x[k],calc_h,1E-6))
        
        # Evaluate gradient of Lagrange function
        L_func = lambda x_ : calc_f(x_) + (calc_h(x_)).dot(v[k])
        L_grad = lambda x_ : my_grad_approx(x_,L_func,1E-6)
        dL.append(L_grad(x[k]))
        norm_dL = linalg.norm(dL[k])
        
        # Evaluate Hessian of Lagrange function
        W.append(my_hes_approx(x[k],L_grad,1E-6))
        
        if(verbose):
            print("*** k =",k," ***")
            print("x_k =",x[k])
            print("v_k =",v[k])
            print("f_k =",f[k])
            print("df_k =",df[k])
            print("h_k =",h[k])
            print("A_k =\n",A[k])
            print("W_k =\n",W[k])
            print("\n")
        
        ######
        # Algorithm 5.2: Add intertia correction to Algorithm 5.1
                
        ### ALG. 5.2, STEP 1
        delta_A = 0
        delta_W = 0
        
        KKT,inertia_correct,pos_ev,neg_ev,zero_ev = assemble_check_KKT(W[k],A[k],delta_A,delta_W,verbose)
        
        if not inertia_correct:
            ### ALG. 5.2, STEP 2
            # Assume zero eigenvalue is from rank-deficient Jacobian.
            # Set delta_A to non-zero value
            if zero_ev != 0:
                delta_A = delta_bar_A
            
            ### ALG. 5.2, STEP 3
            if delta_W_last == 0:
                delta_W = delta_bar_W_0
            else:
                delta_W = max(delta_bar_W_min,kappa_l*delta_W_last)

        while (not inertia_correct) and flag:
            ### ALG. 5.2, STEP 4
            KKT,inertia_correct,pos_ev,neg_ev,zero_ev = assemble_check_KKT(W[k],A[k],delta_A,delta_W,verbose)
            
            ### ALG. 5.2, STEP 5
            if not inertia_correct:
                delta_W = kappa_u*delta_W
                
            ### ALG. 5.2, STEP 6
            if delta_W > delta_bar_W_max:
                flag = False
                print("Abort step computation. KKT matrix is severely ill-conditioned.")
            
        delta_W_last = delta_W
                
        ## STEP 2. Solve linear system.
        
        if flag:
            b = -np.concatenate((dL[k],h[k]),axis=0)
            z = linalg.solve(KKT,b)
        else:
            z = []
            
        ## STEP 3. Take step
        if flag:
            dx = z[0:n]
            dv = z[n:(n+m+1)]
            
            x.append(x[k] + dx)
            v.append(v[k] + dv)
            
            norm_dx = linalg.norm(dx)
            norm_dv = linalg.norm(dv)
        
        ## Print iteration information
        print(k,'  \t{0: 1.4e} \t{1:1.4e} \t{2:1.4e}'.format(f[k],linalg.norm(h[k]),norm_dL),end='')
        
        if flag:
            print(' \t{0: 1.4e} \t{1: 1.4e}'.format(norm_dx,norm_dv),end='')
        else:
            print(' \t -------  \t -------',end='')
        
        print(' \t{0: 1.4e} \t{1: 1.4e}'.format(delta_A,delta_W),end='\n')
        
        # Increment counter
        k = k + 1
        
        ## Check convergence criteria
        if(flag):
            flag = norm_dx > eps1 and norm_dv > eps1 and norm_dL > eps2
    
    if(not flag and k > max_iter):
        print("Reached maximum number of iterations.")
    
    return x,v,f,h,df,dL,A,W

7.8.3. Example Problem 2#

(Where is Example Problem 1? We are using the same number scheme as the previous notebook.)

Consider: $$\begin{align}\min_x \quad & x_1^2 + 2 x_2^2 \\ \mathrm{s.t.} \quad & x_1 + x_2 = 1 \\ & x_1 + x_2 = 1 \end{align}$$

7.8.3.1. Test Algorithm 5.2 with redundant constraints.#

## Define functions
def my_f2(x):
    return x[0]**2 + 2*x[1]**2

def my_h2(x):
    h = np.zeros(2)
    h[0] = x[0] + x[1] - 1
    h[1] = h[0]
    return h

## Declare initial point
x0 = np.array((1,1))

## Run Algorithm 5.2 on test problem
results = alg52(x0,my_f2,my_h2,verbose=False)

## Display results
xstar = results[0][-1]
print("\nx* =",xstar)

## Display results
vstar = results[1][-1]
print("\nv* =",vstar)

Iter. 	f(x) 		||h(x)|| 	||grad_L(x)|| 	||dx|| 		||dv|| 		delta_A 	delta_W
0   	 3.0000e+00 	1.4142e+00 	7.2111e+00 	 7.4537e-01 	 2.3571e+00 	 1.0000e-08 	 1.0000e-04
1   	 6.6667e-01 	2.3373e-08 	2.9368e-04 	 5.4908e-05 	 9.4088e-05 	 1.0000e-08 	 3.3333e-05
2   	 6.6667e-01 	9.4991e-13 	3.7259e-09 	 1.1908e-09 	 6.4780e-11 	 1.0000e-08 	 1.1111e-05

x* = [0.66666667 0.33333333]

v* = [-0.66666666 -0.66666667]

7.8.3.2. Test Algorithm 5.2 without redundant constraints.#

## Define functions
def my_h2b(x):
    return (x[0] + x[1] - 1)*np.ones(1)

## Run Algorithm 5.2 on test problem
results = alg52(x0,my_f2,my_h2b,verbose=False)

## Display results
xstar = results[0][-1]
print("\nx* =",xstar)

## Display results
vstar = results[1][-1]
print("\nv* =",vstar)

Iter. 	f(x) 		||h(x)|| 	||grad_L(x)|| 	||dx|| 		||dv|| 		delta_A 	delta_W
0   	 3.0000e+00 	1.0000e+00 	5.8310e+00 	 7.4537e-01 	 2.3335e+00 	 0.0000e+00 	 0.0000e+00
1   	 6.6667e-01 	1.3978e-10 	2.9473e-04 	 4.5326e-05 	 1.5285e-04 	 0.0000e+00 	 0.0000e+00
2   	 6.6667e-01 	0.0000e+00 	4.5431e-09 	 1.4393e-09 	 2.0170e-09 	 0.0000e+00 	 0.0000e+00

x* = [0.66666667 0.33333333]

v* = [-1.33333333]

7.8.3.3. Discussion#

Compare the multipliers with and without the redundant constraint.

7.8.4. Example Problem 3#

Let us try a model where $\nabla h(x^k)^T$ is full rank but there are multiple local optima.

Consider: $$\begin{align}\min_x \quad & x_1^3 - x_2 -x_1 x_2 - x_2^2 \\ \mathrm{s.t.} \quad & x_1^2 + x_2^2 = 1 \end{align}$$

## Copied from notebook with Algorithm 5.1
def my_f3(x):
    return x[0]**3 - x[1] - x[0]*x[1] - x[1]**2

def my_h3(x):
    return (x[0]**2 + x[1]**2 - 1)*np.ones(1)

def visualize(xk=[]):
    n1 = 101
    n2 = 101
    x1eval = np.linspace(-2,2,n1)
    x2eval = np.linspace(-2,2,n2)
    
    X, Y = np.meshgrid(x1eval, x2eval)
    
    Z = np.zeros([n2,n1])
    
    for i in range(0,n1):
        for j in range(0,n2):
            Z[j,i] = my_f3((X[j,i], Y[j,i]))
            
    fig, ax = plt.subplots(1,1)
    CS = ax.contour(X, Y, Z)
    ax.clabel(CS, inline=1, fontsize=12)

    # Add grid
    plt.grid()
    
    # Add unit circle
    circ = plt.Circle((0, 0), radius=1, edgecolor='b', facecolor='None')
    ax.add_patch(circ)
    
    # Plot iteration history
    if len(xk) > 0:
        for i in range(0,len(xk)):
            if i == len(xk) - 1:
                c = "red"
            else:
                c = "black"
            plt.scatter((xk[i][0]),(xk[i][1]),marker='o',color=c)
    
    plt.xlim([-2,2])
    plt.ylim([-2,2])
    
visualize()

nt = 200
theta = np.linspace(0,2*np.pi,nt)
obj = np.zeros(nt)


for i in range(0,nt):
    x_ = np.cos(theta[i])
    y_ = np.sin(theta[i])
    
    obj[i] = my_f3((x_,y_))


plt.figure()
plt.plot(theta,obj)
plt.xlabel('$\\theta$ [radians]')
plt.ylabel('Objective')
plt.grid()

../../_images/2301140fc766bc439398e86102fd3a0c5ff8876a8abac4391b101eddc9015bbe.png

../../_images/d29a370fff8ad2900fbed46c839e37102fa6ea1d0015f104e1d43e676039576b.png

7.8.4.1. Starting Point Near Global Min ($\theta_0 = 1.0$)#

theta0 = 1.0
x0 = np.array((np.sin(theta0),np.cos(theta0)))

## Run Algorithm 5.1 on test problem
results = alg52(x0,my_f3,my_h3)

## Display results
xstar = results[0][-1]
print("\nx* =",xstar)

## Display results
vstar = results[1][-1]
print("\nv* =",vstar)

## Convert into theta
print("\ntheta* =",np.arccos(xstar[0]),"=",np.arcsin(xstar[1]))

## Visualize
visualize(results[0])

Iter. 	f(x) 		||h(x)|| 	||grad_L(x)|| 	||dx|| 		||dv|| 		delta_A 	delta_W
0   	-6.9105e-01 	0.0000e+00 	3.7501e+00 	 1.1171e+00 	 1.1455e+00 	 0.0000e+00 	 0.0000e+00
1   	-4.0104e+00 	1.2480e+00 	2.1729e+00 	 4.2318e-01 	 3.9672e-01 	 0.0000e+00 	 0.0000e+00
2   	-2.4216e+00 	1.7908e-01 	3.3575e-01 	 8.3649e-02 	 1.0205e-01 	 0.0000e+00 	 0.0000e+00
3   	-2.1438e+00 	6.9971e-03 	1.7074e-02 	 3.5294e-03 	 6.5710e-03 	 0.0000e+00 	 0.0000e+00
4   	-2.1324e+00 	1.2457e-05 	4.6327e-05 	 6.3597e-06 	 1.9707e-05 	 0.0000e+00 	 0.0000e+00
5   	-2.1323e+00 	4.0446e-11 	1.4043e-09 	 2.8044e-10 	 3.3891e-11 	 0.0000e+00 	 0.0000e+00

x* = [0.24215301 0.97023807]

v* = [1.64012795]

theta* = 1.326212035697004 = 1.3262120356970035

../../_images/6c5c2a29b7b732d3f5a3fdb75664afe3402a1f9294774ecfd9c6f911843ed068.png

7.8.4.2. Starting Point Near Local Min ($\theta_0 = \pi$)#

theta0 = np.pi
x0 = np.array((np.sin(theta0),np.cos(theta0)))

## Run Algorithm 5.1 on test problem
results = alg52(x0,my_f3,my_h3)

## Display results
xstar = results[0][-1]
print("\nx* =",xstar)

## Display results
vstar = results[1][-1]
print("\nv* =",vstar)

## Convert into theta
print("\ntheta* =",np.arccos(xstar[0]),"(using arccos)")
print("\ntheta* =",np.arcsin(xstar[1]),"(using arcsin)")

## Visualize
visualize(results[0])

Iter. 	f(x) 		||h(x)|| 	||grad_L(x)|| 	||dx|| 		||dv|| 		delta_A 	delta_W
0   	 2.2204e-16 	0.0000e+00 	1.4142e+00 	 5.0002e-01 	 2.4998e-01 	 0.0000e+00 	 0.0000e+00
1   	-6.2504e-01 	2.5002e-01 	1.0000e+00 	 3.0322e-01 	 4.1490e-01 	 0.0000e+00 	 3.2768e+00
2   	-7.1455e-01 	9.1941e-02 	8.2867e-01 	 6.9738e+00 	 3.9656e+00 	 0.0000e+00 	 0.0000e+00
3   	-1.9540e+02 	4.8634e+01 	5.3609e+01 	 7.3453e+00 	 8.9486e+00 	 0.0000e+00 	 3.2768e+00
4   	-6.6453e+01 	5.3954e+01 	7.3649e+01 	 3.6845e+00 	 4.7204e+00 	 0.0000e+00 	 8.7381e+00
5   	-2.0079e+01 	1.3575e+01 	2.6604e+00 	 1.7791e+00 	 2.6030e-01 	 0.0000e+00 	 0.0000e+00
6   	-6.7442e+00 	3.1651e+00 	9.2895e-01 	 7.7547e-01 	 2.3764e-01 	 0.0000e+00 	 0.0000e+00
7   	-3.0811e+00 	6.0135e-01 	3.5585e-01 	 2.3833e-01 	 1.8564e-01 	 0.0000e+00 	 0.0000e+00
8   	-2.2251e+00 	5.6802e-02 	8.3998e-02 	 2.7726e-02 	 5.2689e-02 	 0.0000e+00 	 0.0000e+00
9   	-2.1336e+00 	7.6871e-04 	2.8474e-03 	 3.8422e-04 	 1.6169e-03 	 0.0000e+00 	 0.0000e+00

x* = [0.24215303 0.97023814]

v* = [1.64012726]

theta* = 1.3262120118414 (using arccos)

theta* = 1.3262123260037786 (using arcsin)

../../_images/6becc963dd9cade1dfcbcda4eb0989483686df629b93458ecfd91645888e6f66.png

7.8.4.3. Starting Point Near Global Max ($\theta_0 = 5.5$)#

theta0 = 5.5
x0 = np.array((np.sin(theta0),np.cos(theta0)))

## Run Algorithm 5.1 on test problem
results = alg52(x0,my_f3,my_h3)

## Display results
xstar = results[0][-1]
print("\nx* =",xstar)

## Display results
vstar = results[1][-1]
print("\nv* =",vstar)

## Convert into theta
print("\ntheta* =",np.arccos(xstar[0]),"(using arccos)")
print("\ntheta* =",np.arcsin(xstar[1]),"(using arcsin)")

## Visualize
visualize(results[0])

Iter. 	f(x) 		||h(x)|| 	||grad_L(x)|| 	||dx|| 		||dv|| 		delta_A 	delta_W
0   	-1.0621e+00 	0.0000e+00 	6.9215e-01 	 5.6411e-01 	 4.3032e-01 	 0.0000e+00 	 3.2768e+00
1   	-2.0216e+00 	3.1822e-01 	2.0226e+00 	 2.3706e-01 	 1.1375e+00 	 0.0000e+00 	 8.7381e+00
2   	-1.9886e+00 	5.6197e-02 	1.3940e+00 	 4.4244e-01 	 4.9825e-02 	 0.0000e+00 	 0.0000e+00
3   	-2.4244e+00 	1.9575e-01 	5.4298e-01 	 1.2207e-01 	 1.5280e-02 	 0.0000e+00 	 0.0000e+00
4   	-2.1566e+00 	1.4900e-02 	3.8186e-02 	 1.0366e-02 	 1.9242e-03 	 0.0000e+00 	 0.0000e+00
5   	-2.1325e+00 	1.0745e-04 	2.7129e-04 	 7.4929e-05 	 1.7543e-05 	 0.0000e+00 	 0.0000e+00
6   	-2.1323e+00 	5.6144e-09 	3.7221e-09 	 2.8396e-09 	 2.9955e-09 	 0.0000e+00 	 0.0000e+00

x* = [0.24215301 0.97023807]

v* = [1.64012795]

theta* = 1.3262120357119471 (using arccos)

theta* = 1.3262120357119473 (using arcsin)

../../_images/fa97e9f383ec2e242de1656abee9f8fb64bd599871e0a1c8ea90548524f4d634.png

Inertia-Corrected Netwon Method for Equality Constrained NLPs

Contents