Andrew Ng Deep Learning: Programming Assignment Solutions
This post simply records my way through Andrew Ng's Deep Learning course; it will be updated continuously.
Course 1: Neural Networks and Deep Learning
Week 2 assignment solutions
1. Python Basics with Numpy (optional assignment)
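For reference, sections 1.1 and 1.2 below implement the sigmoid function and its derivative:

\sigma(x) = \frac{1}{1 + e^{-x}}, \qquad \sigma'(x) = \sigma(x)\,\bigl(1 - \sigma(x)\bigr)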
import math
import numpy as np

### Building basic functions with numpy

## 1.1 - sigmoid function, np.exp()

# GRADED FUNCTION: basic_sigmoid
def basic_sigmoid(x):
    s = 1 / (1 + math.exp(-x))
    return s

# print(basic_sigmoid(3))
# End

x = np.array([1, 2, 3])
# print(np.exp(x))
# print(x + 3)

# GRADED FUNCTION: sigmoid
def sigmoid(x):
    s = 1 / (1 + np.exp(-x))
    return s

res = sigmoid(x)
# print(res)
# End

## 1.2 - Sigmoid gradient

# GRADED FUNCTION: sigmoid_derivative
def sigmoid_derivative(x):
    """
    Compute the gradient (also called the slope or derivative) of the sigmoid function with respect to its input x.
    You can store the output of the sigmoid function into variables and then use it to calculate the gradient.

    Arguments:
    x -- A scalar or numpy array

    Return:
    ds -- Your computed gradient.
    """
    ### START CODE HERE ### (≈ 2 lines of code)
    s = sigmoid(x)
    ds = s * (1 - s)
    ### END CODE HERE ###
    return ds

x = np.array([1, 2, 3])
# print ("sigmoid_derivative(x) = " + str(sigmoid_derivative(x)))
# End

## 1.3 - Reshaping arrays

# GRADED FUNCTION: image2vector
def image2vector(image):
    """
    Argument:
    image -- a numpy array of shape (length, height, depth)

    Returns:
    v -- a vector of shape (length*height*depth, 1)
    """
    ### START CODE HERE ### (≈ 1 line of code)
    # print(image.shape)
    # Use shape[0] * shape[1] * shape[2] so this works for any image size,
    # not just the 3 x 3 x 2 test array below.
    v = image.reshape((image.shape[0] * image.shape[1] * image.shape[2], 1))
    ### END CODE HERE ###
    return v

# This is a 3 by 3 by 2 array; typically images will be (num_px_x, num_px_y, 3) where 3 represents the RGB values
image = np.array((
    [[0.67826139, 0.29380381],
     [0.90714982, 0.52835647],
     [0.4215251,  0.45017551]],

    [[0.92814219, 0.96677647],
     [0.85304703, 0.52351845],
     [0.19981397, 0.27417313]],

    [[0.60659855, 0.00533165],
     [0.10820313, 0.49978937],
     [0.34144279, 0.94630077]]
))
# print("image2vector(image) = " + str(image2vector(image)))
# End

## 1.4 - Normalizing rows

# GRADED FUNCTION: normalizeRows
def normalizeRows(x):
    """
    Implement a function that normalizes each row of the matrix x (to have unit length).

    Argument:
    x -- A numpy matrix of shape (n, m)

    Returns:
    x -- The normalized (by row) numpy matrix. You are allowed to modify x.
    """
    ### START CODE HERE ### (≈ 2 lines of code)
    # Compute x_norm as the norm 2 of x. Use np.linalg.norm(..., ord = 2, axis = ..., keepdims = True)
    # ord=2: the L2 norm of each row, i.e. the square root of the sum of squared entries
    x_norm = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
    # Divide x by its norm.
    x = x / x_norm
    ### END CODE HERE ###
    return x

x = np.array([
    [0, 3, 4],
    [1, 6, 4]
])
# print("normalizeRows(x) = " + str(normalizeRows(x)))
# End

## 1.5 - Broadcasting and the softmax function

# GRADED FUNCTION: softmax
def softmax(x):
    """Calculates the softmax for each row of the input x.

    Your code should work for a row vector and also for matrices of shape (n, m).

    Argument:
    x -- A numpy matrix of shape (n, m)

    Returns:
    s -- A numpy matrix equal to the softmax of x, of shape (n, m)
    """
    ### START CODE HERE ### (≈ 3 lines of code)
    # Apply exp() element-wise to x. Use np.exp(...).
    x_exp = np.exp(x)
    # Create a vector x_sum that sums each row of x_exp. Use np.sum(..., axis = 1, keepdims = True).
    x_sum = np.sum(x_exp, axis=1, keepdims=True)
    # Compute softmax(x) by dividing x_exp by x_sum. It should automatically use numpy broadcasting.
    s = x_exp / x_sum
    ### END CODE HERE ###
    return s

x = np.array([
    [9, 2, 5, 0, 0],
    [7, 5, 0, 0, 0]
])
# print("softmax(x) = " + str(softmax(x)))
# End

## 2.1 - Implement the L1 and L2 loss functions

# GRADED FUNCTION: L1
def L1(yhat, y):
    """
    Arguments:
    yhat -- vector of size m (predicted labels)
    y -- vector of size m (true labels)

    Returns:
    loss -- the value of the L1 loss function defined above
    """
    ### START CODE HERE ### (≈ 1 line of code)
    loss = np.sum(np.abs(y - yhat))
    ### END CODE HERE ###
    return loss

yhat = np.array([.9, 0.2, 0.1, .4, .9])
y = np.array([1, 0, 0, 1, 1])
# print("L1 = " + str(L1(yhat, y)))
# End

# GRADED FUNCTION: L2
def L2(yhat, y):
    """
    Arguments:
    yhat -- vector of size m (predicted labels)
    y -- vector of size m (true labels)

    Returns:
    loss -- the value of the L2 loss function defined above
    """
    ### START CODE HERE ### (≈ 1 line of code)
    loss = np.sum(np.power((y - yhat), 2))
    ### END CODE HERE ###
    return loss

yhat = np.array([.9, 0.2, 0.1, .4, .9])
y = np.array([1, 0, 0, 1, 1])
print("L2 = " + str(L2(yhat, y)))
# End
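For reference, the row-wise softmax and the two loss functions implemented above are:

\mathrm{softmax}(x)_{ij} = \frac{e^{x_{ij}}}{\sum_{k} e^{x_{ik}}}

L_1(\hat{y}, y) = \sum_{i=1}^{m} \left| y^{(i)} - \hat{y}^{(i)} \right|, \qquad
L_2(\hat{y}, y) = \sum_{i=1}^{m} \left( y^{(i)} - \hat{y}^{(i)} \right)^2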
2. Logistic Regression with a Neural Network mindset
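The propagate() function below implements the logistic regression forward pass, the cross-entropy cost, and its gradients from the assignment:

A = \sigma(w^{T}X + b)

J = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log a^{(i)} + (1 - y^{(i)}) \log (1 - a^{(i)}) \right]

\frac{\partial J}{\partial w} = \frac{1}{m} X (A - Y)^{T}, \qquad
\frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} \left( a^{(i)} - y^{(i)} \right)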
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
from PIL import Image
from scipy import ndimage
from lr_utils import load_dataset

# Loading the data (cat/non-cat)
train_set_x_orig, train_set_y, test_set_x_orig, test_set_y, classes = load_dataset()

# Example of a picture
# index = 25
# plt.imshow(train_set_x_orig[index])
# print("y = " + str(train_set_y[:, index])
#       + ", it's a '"
#       + classes[np.squeeze(train_set_y[:, index])].decode("utf-8")
#       + "' picture.")

### START CODE HERE ### (≈ 3 lines of code)
m_train = train_set_x_orig.shape[0]
m_test = test_set_x_orig.shape[0]
num_px = train_set_x_orig.shape[1]
### END CODE HERE ###

# print ("Number of training examples: m_train = " + str(m_train))
# print ("Number of testing examples: m_test = " + str(m_test))
# print ("Height/Width of each image: num_px = " + str(num_px))
# print ("Each image is of size: (" + str(num_px) + ", " + str(num_px) + ", 3)")
# print ("train_set_x shape: " + str(train_set_x_orig.shape))
# print ("train_set_y shape: " + str(train_set_y.shape))
# print ("test_set_x shape: " + str(test_set_x_orig.shape))
# print ("test_set_y shape: " + str(test_set_y.shape))

# Reshape the training and test examples.
# Keep the first dimension and flatten the rest, then transpose; reshaping directly
# to (num_px * num_px * 3, m) would scramble the pixels across examples.
### START CODE HERE ### (≈ 2 lines of code)
train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T
test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T
### END CODE HERE ###

# print ("train_set_x_flatten shape: " + str(train_set_x_flatten.shape))
# print ("train_set_y shape: " + str(train_set_y.shape))
# print ("test_set_x_flatten shape: " + str(test_set_x_flatten.shape))
# print ("test_set_y shape: " + str(test_set_y.shape))
# print ("sanity check after reshaping: " + str(train_set_x_flatten[0:5, 0]))

train_set_x = train_set_x_flatten / 255
test_set_x = test_set_x_flatten / 255

# GRADED FUNCTION: sigmoid
def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s

# print ("sigmoid([0, 2]) = " + str(sigmoid(np.array([0, 2]))))

# GRADED FUNCTION: initialize_with_zeros
def initialize_with_zeros(dim):
    """
    This function creates a vector of zeros of shape (dim, 1) for w and initializes b to 0.

    Argument:
    dim -- size of the w vector we want (or number of parameters in this case)

    Returns:
    w -- initialized vector of shape (dim, 1)
    b -- initialized scalar (corresponds to the bias)
    """
    ### START CODE HERE ### (≈ 1 line of code)
    w = np.zeros((dim, 1))
    b = 0.
    ### END CODE HERE ###

    assert (w.shape == (dim, 1))
    assert (isinstance(b, float) or isinstance(b, int))

    return w, b

# dim = 2
# w, b = initialize_with_zeros(dim)
# print ("w = " + str(w))
# print ("b = " + str(b))

# GRADED FUNCTION: propagate
def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation explained above

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)

    Return:
    cost -- negative log-likelihood cost for logistic regression
    dw -- gradient of the loss with respect to w, thus same shape as w
    db -- gradient of the loss with respect to b, thus same shape as b

    Tips:
    - Write your code step by step for the propagation. np.log(), np.dot()
    """
    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)
    ### START CODE HERE ### (≈ 2 lines of code)
    A = sigmoid(np.dot(w.T, X) + b)                                      # compute activation
    cost = (-1 / m) * np.sum((Y * np.log(A)) + (1 - Y) * np.log(1 - A))  # compute cost

    # BACKWARD PROPAGATION (TO FIND GRAD)
    dw = (1 / m) * np.dot(X, (A - Y).T)
    db = (1 / m) * np.sum(A - Y)
    ### END CODE HERE ###

    assert (dw.shape == w.shape)
    assert (db.dtype == float)
    cost = np.squeeze(cost)
    assert (cost.shape == ())

    grads = {"dw": dw,
             "db": db}

    return grads, cost

# w, b, X, Y = np.array([[1], [2]]), 2, np.array([[1, 2], [3, 4]]), np.array([[1, 0]])
# grads, cost = propagate(w, b, X, Y)
# print ("dw = " + str(grads["dw"]))
# print ("db = " + str(grads["db"]))
# print ("cost = " + str(cost))

# GRADED FUNCTION: optimize
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost=False):
    """
    This function optimizes w and b by running a gradient descent algorithm

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of shape (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat), of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the loss every 100 steps

    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients of the weights and bias with respect to the cost function
    costs -- list of all the costs computed during the optimization, this will be used to plot the learning curve.

    Tips:
    You basically need to write down two steps and iterate through them:
        1) Calculate the cost and the gradient for the current parameters. Use propagate().
        2) Update the parameters using the gradient descent rule for w and b.
""" costs = [] for i in range(num_iterations): # Cost and gradient calculation (≈ 1-4 lines of code) ### START CODE HERE ### grads, cost = propagate(w, b, X, Y) ### END CODE HERE ### # Retrieve derivatives from grads dw = grads["dw"] db = grads["db"] # update rule (≈ 2 lines of code) ### START CODE HERE ### w = w - learning_rate * dw b = b - learning_rate * db ### END CODE HERE ### # Record the costs if i % 100 == 0: costs.append(cost) # Print the cost every 100 training examples if print_cost and i % 100 == 0: print("Cost after iteration %i: %f" % (i, cost)) params = {"w": w, "b": b} grads = {"dw": dw, "db": db} return params, grads, costs # params, grads, costs = optimize(w, b, X, Y, num_iterations= 100, learning_rate = 0.009, print_cost = False) # print ("w = " + str(params["w"])) # print ("b = " + str(params["b"])) # print ("dw = " + str(grads["dw"])) # print ("db = " + str(grads["db"])) # GRADED FUNCTION: predict def predict(w, b, X): ''' Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b) Arguments: w -- weights, a numpy array of size (num_px * num_px * 3, 1) b -- bias, a scalar X -- data of size (num_px * num_px * 3, number of examples) Returns: Y_prediction -- a numpy array (vector) containing all predictions (0/1) for the examples in X ''' m = X.shape[1] Y_prediction = np.zeros((1, m)) w = w.reshape(X.shape[0], 1) # Compute vector "A" predicting the probabilities of a cat being present in the picture ### START CODE HERE ### (≈ 1 line of code) A = sigmoid(np.dot(w.T, X) + b) ### END CODE HERE ### for i in range(A.shape[1]): # Convert probabilities A[0,i] to actual predictions p[0,i] ### START CODE HERE ### (≈ 4 lines of code) Y_prediction[0, i] = A[0, i] >= 0.5 ### END CODE HERE ### assert (Y_prediction.shape == (1, m)) return Y_prediction # print ("predictions = " + str(predict(w, b, X))) # GRADED FUNCTION: model def model(X_train, Y_train, X_test, Y_test, num_iterations=2000, learning_rate=0.5, print_cost=False): """ Builds the logistic regression model by calling the function you've implemented previously Arguments: X_train -- training set represented by a numpy array of shape (num_px * num_px * 3, m_train) Y_train -- training labels represented by a numpy array (vector) of shape (1, m_train) X_test -- test set represented by a numpy array of shape (num_px * num_px * 3, m_test) Y_test -- test labels represented by a numpy array (vector) of shape (1, m_test) num_iterations -- hyperparameter representing the number of iterations to optimize the parameters learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize() print_cost -- Set to true to print the cost every 100 iterations Returns: d -- dictionary containing information about the model. 
""" ### START CODE HERE ### # initialize parameters with zeros (≈ 1 line of code) w, b = initialize_with_zeros(X_train.shape[0]) # Gradient descent (≈ 1 line of code) parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost) # Retrieve parameters w and b from dictionary "parameters" w = parameters["w"] b = parameters["b"] # Predict test/train set examples (≈ 2 lines of code) Y_prediction_test = predict(w, b, X_test) Y_prediction_train = predict(w, b, X_train) ### END CODE HERE ### # Print train/test Errors print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100)) print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100)) d = {"costs": costs, "Y_prediction_test": Y_prediction_test, "Y_prediction_train": Y_prediction_train, "w": w, "b": b, "learning_rate": learning_rate, "num_iterations": num_iterations} return d def testMyImage(fname): ''' Args: fname :the file name of picture ''' image = np.array(ndimage.imread(fname, flatten=False)) my_image = scipy.misc.imresize(image, size=(num_px, num_px)).reshape((1, num_px * num_px * 3)).T my_predicted_image = predict(d["w"], d["b"], my_image) plt.imshow(image) plt.axis("off") plt.show() print("y = " + str(np.squeeze(my_predicted_image)) + ", your algorithm predicts a \"" + classes[int(np.squeeze(my_predicted_image)),].decode("utf-8") + "\" picture.") if __name__ == '__main__': train_set_x_orig, train_set_y, test_set_x_orig, test_set_y, classes = load_dataset() # process data m_train = train_set_x_orig.shape[0] m_test = test_set_x_orig.shape[0] num_px = train_set_x_orig.shape[1] train_set_x_flatten = train_set_x_orig.reshape(train_set_x_orig.shape[0], -1).T # 保留第一个维度 test_set_x_flatten = test_set_x_orig.reshape(test_set_x_orig.shape[0], -1).T train_set_x = train_set_x_flatten / 255. test_set_x = test_set_x_flatten / 255. d = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations=2000, learning_rate=0.005, print_cost=True) # Example of a picture that was wrongly classified. index = 1 plt.imshow(test_set_x[:, index].reshape((num_px, num_px, 3))) # print("y = " + str(test_set_y[0, index]) # + ", you predicted that it is a \"" # + classes[d["Y_prediction_test"][0, index]].decode("utf-8") # + "\" picture.") # Plot learning curve (with costs) costs = np.squeeze(d['costs']) plt.plot(costs) plt.ylabel('cost') plt.xlabel('iterations (per hundreds)') plt.title("Learning rate = " + str(d["learning_rate"])) plt.show() # Choice of learning rate # learning_rates = [0.01, 0.001, 0.0001] learning_rates = [0.01] models = {} for i in learning_rates: print("learning rate is: " + str(i)) models[str(i)] = model(train_set_x, train_set_y, test_set_x, test_set_y, num_iterations=1500, learning_rate = i, print_cost = False) print('\n' + "-------------------------------------------------------" + '\n') for i in learning_rates: plt.plot(np.squeeze(models[str(i)]["costs"]), label = str(models[str(i)]["learning_rate"])) plt.ylabel('cost') plt.xlabel('iterations') legend = plt.legend(loc = 'upper center', shadow = True) frame = legend.get_frame() frame.set_facecolor('0.90') plt.show() ## START CODE HERE ## (PUT YOUR IMAGE NAME) my_image = "cat.jpg" # change this to the name of your image file ## END CODE HERE ## # We preprocess the image to fit your algorithm. fname = "images/" + my_image testMyImage(fname)
3. Planar data classification with one hidden layer
Course 1, Week 3 exercises
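The one-hidden-layer network below (tanh hidden activation, sigmoid output) implements the following forward and backward equations from the course:

Z^{[1]} = W^{[1]}X + b^{[1]}, \quad A^{[1]} = \tanh(Z^{[1]}), \quad
Z^{[2]} = W^{[2]}A^{[1]} + b^{[2]}, \quad A^{[2]} = \sigma(Z^{[2]})

J = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log a^{[2](i)} + (1 - y^{(i)}) \log \bigl(1 - a^{[2](i)}\bigr) \right]

dZ^{[2]} = A^{[2]} - Y, \quad dW^{[2]} = \frac{1}{m} dZ^{[2]} A^{[1]T}, \quad db^{[2]} = \frac{1}{m} \sum_{i} dZ^{[2](i)}

dZ^{[1]} = W^{[2]T} dZ^{[2]} \ast \bigl(1 - (A^{[1]})^{2}\bigr), \quad
dW^{[1]} = \frac{1}{m} dZ^{[1]} X^{T}, \quad db^{[1]} = \frac{1}{m} \sum_{i} dZ^{[1](i)}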
import numpy as np
import matplotlib.pyplot as plt
from testCases import *
import sklearn
import sklearn.datasets
import sklearn.linear_model
from planar_utils import plot_decision_boundary, sigmoid, load_planar_dataset, load_extra_datasets

np.random.seed(1)

X, Y = load_planar_dataset()

# Visualize the data
plt.scatter(X[0, :], X[1, :], c=Y, s=40, cmap=plt.cm.Spectral)

### START CODE HERE ### (≈ 3 lines of code)
shape_X = X.shape
shape_Y = Y.shape
m = Y.shape[1]   # training set size
### END CODE HERE ###

# print ('The shape of X is: ' + str(shape_X))
# print ('The shape of Y is: ' + str(shape_Y))
# print ('I have m = %d training examples!' % (m))

# Train the logistic regression classifier
clf = sklearn.linear_model.LogisticRegressionCV()
clf.fit(X.T, Y.T)

# Plot the decision boundary for logistic regression
plot_decision_boundary(lambda x: clf.predict(x), X, Y)
plt.title("Logistic Regression")

# Print accuracy
LR_predictions = clf.predict(X.T)
# print ('Accuracy of logistic regression: %d ' % float((np.dot(Y, LR_predictions) + np.dot(1 - Y, 1 - LR_predictions)) / float(Y.size) * 100) +
#        '% ' + "(percentage of correctly labelled datapoints)")

# GRADED FUNCTION: layer_sizes
def layer_sizes(X, Y):
    """
    Arguments:
    X -- input dataset of shape (input size, number of examples)
    Y -- labels of shape (output size, number of examples)

    Returns:
    n_x -- the size of the input layer
    n_h -- the size of the hidden layer
    n_y -- the size of the output layer
    """
    ### START CODE HERE ### (≈ 3 lines of code)
    n_x = X.shape[0]   # size of input layer
    n_h = 4
    n_y = Y.shape[0]   # size of output layer
    ### END CODE HERE ###
    return (n_x, n_h, n_y)

# X_assess, Y_assess = layer_sizes_test_case()
# (n_x, n_h, n_y) = layer_sizes(X_assess, Y_assess)
# print("The size of the input layer is: n_x = " + str(n_x))
# print("The size of the hidden layer is: n_h = " + str(n_h))
# print("The size of the output layer is: n_y = " + str(n_y))

# GRADED FUNCTION: initialize_parameters
def initialize_parameters(n_x, n_h, n_y):
    """
    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    params -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    """
    np.random.seed(2)   # we set up a seed so that your output matches ours although the initialization is random.

    ### START CODE HERE ### (≈ 4 lines of code)
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))
    ### END CODE HERE ###

    assert (W1.shape == (n_h, n_x))
    assert (b1.shape == (n_h, 1))
    assert (W2.shape == (n_y, n_h))
    assert (b2.shape == (n_y, 1))

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}

    return parameters

# n_x, n_h, n_y = initialize_parameters_test_case()
# parameters = initialize_parameters(n_x, n_h, n_y)
# print("W1 = " + str(parameters["W1"]))
# print("b1 = " + str(parameters["b1"]))
# print("W2 = " + str(parameters["W2"]))
# print("b2 = " + str(parameters["b2"]))

def forward_propagation(X, parameters):
    """
    Argument:
    X -- input data of size (n_x, m)
    parameters -- python dictionary containing your parameters (output of initialization function)

    Returns:
    A2 -- The sigmoid output of the second activation
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2"
    """
    # Retrieve each parameter from the dictionary "parameters"
    ### START CODE HERE ### (≈ 4 lines of code)
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    ### END CODE HERE ###

    # Implement Forward Propagation to calculate A2 (probabilities)
    ### START CODE HERE ### (≈ 4 lines of code)
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)
    ### END CODE HERE ###

    assert (A2.shape == (1, X.shape[1]))

    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2}

    return A2, cache

# X_assess, parameters = forward_propagation_test_case()
# A2, cache = forward_propagation(X_assess, parameters)
# Note: we use the mean here just to make sure that your output matches ours.
# print(np.mean(cache['Z1']), np.mean(cache['A1']), np.mean(cache['Z2']), np.mean(cache['A2']))

# GRADED FUNCTION: compute_cost
def compute_cost(A2, Y, parameters):
    """
    Computes the cross-entropy cost given in equation (13)

    Arguments:
    A2 -- The sigmoid output of the second activation, of shape (1, number of examples)
    Y -- "true" labels vector of shape (1, number of examples)
    parameters -- python dictionary containing your parameters W1, b1, W2 and b2

    Returns:
    cost -- cross-entropy cost given equation (13)
    """
    m = Y.shape[1]   # number of examples

    # Compute the cross-entropy cost
    ### START CODE HERE ### (≈ 2 lines of code)
    logprobs = np.multiply(np.log(A2), Y) + np.multiply(np.log(1 - A2), 1 - Y)
    cost = (-1 / m) * np.sum(logprobs)
    ### END CODE HERE ###

    cost = np.squeeze(cost)   # makes sure cost is the dimension we expect, e.g. turns [[17]] into 17
    assert (isinstance(cost, float))

    return cost

# A2, Y_assess, parameters = compute_cost_test_case()
# print("cost = " + str(compute_cost(A2, Y_assess, parameters)))

# GRADED FUNCTION: backward_propagation
def backward_propagation(parameters, cache, X, Y):
    """
    Implement the backward propagation using the instructions above.

    Arguments:
    parameters -- python dictionary containing our parameters
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2".
    X -- input data of shape (2, number of examples)
    Y -- "true" labels vector of shape (1, number of examples)

    Returns:
    grads -- python dictionary containing your gradients with respect to different parameters
    """
    m = X.shape[1]

    # First, retrieve W1 and W2 from the dictionary "parameters".
    ### START CODE HERE ### (≈ 2 lines of code)
    W1 = parameters['W1']
    W2 = parameters['W2']
    ### END CODE HERE ###

    # Retrieve also A1 and A2 from dictionary "cache".
    ### START CODE HERE ### (≈ 2 lines of code)
    A1 = cache['A1']
    A2 = cache['A2']
    ### END CODE HERE ###

    # Backward propagation: calculate dW1, db1, dW2, db2.
    ### START CODE HERE ### (≈ 6 lines of code, corresponding to 6 equations on slide above)
    dZ2 = A2 - Y
    dW2 = np.dot(dZ2, A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m
    dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
    dW1 = np.dot(dZ1, X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m
    ### END CODE HERE ###

    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}

    return grads

# parameters, cache, X_assess, Y_assess = backward_propagation_test_case()
# grads = backward_propagation(parameters, cache, X_assess, Y_assess)
# print ("dW1 = " + str(grads["dW1"]))
# print ("db1 = " + str(grads["db1"]))
# print ("dW2 = " + str(grads["dW2"]))
# print ("db2 = " + str(grads["db2"]))

def update_parameters(parameters, grads, learning_rate=1.2):
    """
    Updates parameters using the gradient descent update rule given above

    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients

    Returns:
    parameters -- python dictionary containing your updated parameters
    """
    # Retrieve each parameter from the dictionary "parameters"
    ### START CODE HERE ### (≈ 4 lines of code)
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    ### END CODE HERE ###

    # Retrieve each gradient from the dictionary "grads"
    ### START CODE HERE ### (≈ 4 lines of code)
    dW1 = grads['dW1']
    db1 = grads['db1']
    dW2 = grads['dW2']
    db2 = grads['db2']
    ### END CODE HERE ###

    # Update rule for each parameter
    ### START CODE HERE ### (≈ 4 lines of code)
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2
    ### END CODE HERE ###

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}

    return parameters

def nn_model(X, Y, n_h, num_iterations=10000, print_cost=False):
    """
    Arguments:
    X -- dataset of shape (2, number of examples)
    Y -- labels of shape (1, number of examples)
    n_h -- size of the hidden layer
    num_iterations -- Number of iterations in gradient descent loop
    print_cost -- if True, print the cost every 1000 iterations

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """
    np.random.seed(3)
    n_x = layer_sizes(X, Y)[0]
    n_y = layer_sizes(X, Y)[2]

    # Initialize parameters, then retrieve W1, b1, W2, b2. Inputs: "n_x, n_h, n_y". Outputs = "W1, b1, W2, b2, parameters".
    ### START CODE HERE ### (≈ 5 lines of code)
    parameters = initialize_parameters(n_x, n_h, n_y)
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    ### END CODE HERE ###

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        ### START CODE HERE ### (≈ 4 lines of code)
        # Forward propagation. Inputs: "X, parameters". Outputs: "A2, cache".
        A2, cache = forward_propagation(X, parameters)

        # Cost function. Inputs: "A2, Y, parameters". Outputs: "cost".
        cost = compute_cost(A2, Y, parameters)

        # Backpropagation. Inputs: "parameters, cache, X, Y". Outputs: "grads".
        grads = backward_propagation(parameters, cache, X, Y)

        # Gradient descent parameter update. Inputs: "parameters, grads". Outputs: "parameters".
        parameters = update_parameters(parameters, grads)
        ### END CODE HERE ###

        # Print the cost every 1000 iterations
        if print_cost and i % 1000 == 0:
            print ("Cost after iteration %i: %f" % (i, cost))

    return parameters

def predict(parameters, X):
    """
    Using the learned parameters, predicts a class for each example in X

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data of size (n_x, m)

    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    # Computes probabilities using forward propagation, and classifies to 0/1 using 0.5 as the threshold.
    ### START CODE HERE ### (≈ 2 lines of code)
    A2, cache = forward_propagation(X, parameters)
    predictions = np.around(A2)
    ### END CODE HERE ###

    return predictions

# parameters, X_assess = predict_test_case()
# predictions = predict(parameters, X_assess)
# print("predictions mean = " + str(np.mean(predictions)))

# Build a model with a n_h-dimensional hidden layer
# parameters = nn_model(X, Y, n_h = 4, num_iterations = 10000, print_cost=True)

# Plot the decision boundary
# plot_decision_boundary(lambda x: predict(parameters, x.T), X, Y)
# plt.title("Decision Boundary for hidden layer size " + str(4))

# Print accuracy
# predictions = predict(parameters, X)
# print ('Accuracy: %d' % float((np.dot(Y, predictions.T) + np.dot(1 - Y, 1 - predictions.T)) / float(Y.size) * 100) + '%')
# plt.show()

# This may take about 2 minutes to run
# plt.figure(figsize=(16, 32))
# hidden_layer_sizes = [1, 2, 3, 4, 5, 20, 50]
# for i, n_h in enumerate(hidden_layer_sizes):
#     plt.subplot(5, 2, i + 1)
#     plt.title('Hidden Layer of size %d' % n_h)
#     parameters = nn_model(X, Y, n_h, num_iterations = 5000)
#     plot_decision_boundary(lambda x: predict(parameters, x.T), X, Y)
#     predictions = predict(parameters, X)
#     accuracy = float((np.dot(Y, predictions.T) + np.dot(1 - Y, 1 - predictions.T)) / float(Y.size) * 100)
#     print ("Accuracy for {} hidden units: {} %".format(n_h, accuracy))

# Datasets
noisy_circles, noisy_moons, blobs, gaussian_quantiles, no_structure = load_extra_datasets()

datasets = {"noisy_circles": noisy_circles,
            "noisy_moons": noisy_moons,
            "blobs": blobs,
            "gaussian_quantiles": gaussian_quantiles}

### START CODE HERE ### (choose your dataset)
dataset = "noisy_moons"
### END CODE HERE ###

X, Y = datasets[dataset]
X, Y = X.T, Y.reshape(1, Y.shape[0])

# make blobs binary
if dataset == "blobs":
    Y = Y % 2

# Visualize the data
plt.scatter(X[0, :], X[1, :], c=Y, s=40, cmap=plt.cm.Spectral)
plt.show()
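The listing above only visualizes the selected extra dataset; a short sketch (mirroring the commented-out cells earlier) of training and scoring the same nn_model on it:

# Train the one-hidden-layer model on the currently selected extra dataset
parameters = nn_model(X, Y, n_h=4, num_iterations=10000, print_cost=True)

# Plot the learned decision boundary and report accuracy
plot_decision_boundary(lambda x: predict(parameters, x.T), X, Y)
plt.title("Decision Boundary for " + dataset)
plt.show()

predictions = predict(parameters, X)
accuracy = float((np.dot(Y, predictions.T) + np.dot(1 - Y, 1 - predictions.T)) / float(Y.size) * 100)
print("Accuracy: {} %".format(accuracy))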
Course 1, Week 4 exercises
4. Building your Deep Neural Network
Reference: http://blog.csdn.net/justry24/article/details/77994038
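The building blocks below implement, for a generic layer l with activation g^{[l]}, the forward and backward formulas:

Z^{[l]} = W^{[l]}A^{[l-1]} + b^{[l]}, \quad A^{[l]} = g^{[l]}(Z^{[l]})

dZ^{[l]} = dA^{[l]} \ast g^{[l]\prime}(Z^{[l]}), \quad
dW^{[l]} = \frac{1}{m} dZ^{[l]} A^{[l-1]T}, \quad
db^{[l]} = \frac{1}{m} \sum_{i} dZ^{[l](i)}, \quad
dA^{[l-1]} = W^{[l]T} dZ^{[l]}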
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases_v2 import *
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward

plt.rcParams['figure.figsize'] = (5.0, 4.0)   # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# GRADED FUNCTION: initialize_parameters
def initialize_parameters(n_x, n_h, n_y):
    """
    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    parameters -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    """
    np.random.seed(1)

    ### START CODE HERE ### (≈ 4 lines of code)
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))
    ### END CODE HERE ###

    assert (W1.shape == (n_h, n_x))
    assert (b1.shape == (n_h, 1))
    assert (W2.shape == (n_y, n_h))
    assert (b2.shape == (n_y, 1))

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}

    return parameters

# GRADED FUNCTION: initialize_parameters_deep
def initialize_parameters_deep(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)   # number of layers in the network

    for l in range(1, L):
        ### START CODE HERE ### (≈ 2 lines of code)
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * 0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        ### END CODE HERE ###

        assert (parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert (parameters['b' + str(l)].shape == (layer_dims[l], 1))

    return parameters

# GRADED FUNCTION: linear_forward
def linear_forward(A, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter
    cache -- a python tuple containing "A", "W" and "b"; stored for computing the backward pass efficiently
    """
    ### START CODE HERE ### (≈ 1 line of code)
    Z = np.dot(W, A) + b
    ### END CODE HERE ###

    assert (Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)

    return Z, cache

# GRADED FUNCTION: linear_activation_forward
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache"; stored for computing the backward pass efficiently
    """
    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        ### START CODE HERE ### (≈ 2 lines of code)
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
        ### END CODE HERE ###

    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        ### START CODE HERE ### (≈ 2 lines of code)
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
        ### END CODE HERE ###

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

# GRADED FUNCTION: L_model_forward
def L_model_forward(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()

    Returns:
    AL -- last post-activation value
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (there are L-1 of them, indexed from 0 to L-2)
                the cache of linear_activation_forward() with "sigmoid" (there is one, indexed L-1)
    """
    caches = []
    A = X
    L = len(parameters) // 2   # number of layers in the neural network

    # Implement [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    for l in range(1, L):
        A_prev = A
        ### START CODE HERE ### (≈ 2 lines of code)
        A, cache = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation='relu')
        caches.append(cache)
        ### END CODE HERE ###

    # Implement LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    ### START CODE HERE ### (≈ 2 lines of code)
    AL, cache = linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], activation='sigmoid')
    caches.append(cache)
    ### END CODE HERE ###

    assert (AL.shape == (1, X.shape[1]))

    return AL, caches

# GRADED FUNCTION: compute_cost
def compute_cost(AL, Y):
    """
    Implement the cost function defined by equation (7).
    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    m = Y.shape[1]

    # Compute loss from aL and y.
    ### START CODE HERE ### (≈ 1 line of code)
    cost = -np.sum(np.multiply(np.log(AL), Y) + np.multiply(np.log(1 - AL), 1 - Y)) / m
    ### END CODE HERE ###

    cost = np.squeeze(cost)   # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert (cost.shape == ())

    return cost

# GRADED FUNCTION: linear_backward
def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]

    ### START CODE HERE ### (≈ 3 lines of code)
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)
    ### END CODE HERE ###

    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)

    return dA_prev, dW, db

# GRADED FUNCTION: linear_activation_backward
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache

    if activation == "relu":
        ### START CODE HERE ### (≈ 2 lines of code)
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        ### END CODE HERE ###

    elif activation == "sigmoid":
        ### START CODE HERE ### (≈ 2 lines of code)
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        ### END CODE HERE ###

    return dA_prev, dW, db

# GRADED FUNCTION: L_model_backward
def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e. l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ...
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ...
    """
    grads = {}
    L = len(caches)   # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)   # after this line, Y is the same shape as AL

    # Initializing the backpropagation
    ### START CODE HERE ### (1 line of code)
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    ### END CODE HERE ###

    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "dAL, caches". Outputs: "grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)]"
    ### START CODE HERE ### (approx. 2 lines)
    current_cache = caches[L - 1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    ### END CODE HERE ###

    for l in reversed(range(L - 1)):
        # lth layer: (RELU -> LINEAR) gradients.
        # Inputs: "grads["dA" + str(l + 2)], caches". Outputs: "grads["dA" + str(l + 1)], grads["dW" + str(l + 1)], grads["db" + str(l + 1)]"
        ### START CODE HERE ### (approx. 5 lines)
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation="relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
        ### END CODE HERE ###

    return grads

# GRADED FUNCTION: update_parameters
def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent

    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of L_model_backward

    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """
    L = len(parameters) // 2   # number of layers in the neural network

    # Update rule for each parameter. Use a for loop.
    ### START CODE HERE ### (≈ 3 lines of code)
    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
    ### END CODE HERE ###

    return parameters

def two_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    """
    Implements a two-layer neural network: LINEAR->RELU->LINEAR->SIGMOID.

    Arguments:
    X -- input data, of shape (n_x, number of examples)
    Y -- true "label" vector (containing 1 if cat, 0 if non-cat), of shape (1, number of examples)
    layers_dims -- dimensions of the layers (n_x, n_h, n_y)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- If set to True, this will print the cost every 100 iterations

    Returns:
    parameters -- a dictionary containing W1, W2, b1, and b2
    """
    np.random.seed(1)
    grads = {}
    costs = []   # to keep track of the cost
    m = X.shape[1]   # number of examples
    (n_x, n_h, n_y) = layers_dims

    # Initialize parameters dictionary, by calling one of the functions you'd previously implemented
    ### START CODE HERE ### (≈ 1 line of code)
    parameters = initialize_parameters(n_x, n_h, n_y)
    ### END CODE HERE ###

    # Get W1, b1, W2 and b2 from the dictionary parameters.
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: LINEAR -> RELU -> LINEAR -> SIGMOID. Inputs: "X, W1, b1, W2, b2". Output: "A1, cache1, A2, cache2".
        ### START CODE HERE ### (≈ 2 lines of code)
        A1, cache1 = linear_activation_forward(X, W1, b1, 'relu')
        A2, cache2 = linear_activation_forward(A1, W2, b2, 'sigmoid')
        ### END CODE HERE ###

        # Compute cost
        ### START CODE HERE ### (≈ 1 line of code)
        cost = compute_cost(A2, Y)
        ### END CODE HERE ###

        # Initializing backward propagation
        dA2 = - (np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))

        # Backward propagation. Inputs: "dA2, cache2, cache1". Outputs: "dA1, dW2, db2; also dA0 (not used), dW1, db1".
        ### START CODE HERE ### (≈ 2 lines of code)
        dA1, dW2, db2 = linear_activation_backward(dA2, cache2, 'sigmoid')
        dA0, dW1, db1 = linear_activation_backward(dA1, cache1, 'relu')
        ### END CODE HERE ###

        # Set grads['dW1'] to dW1, grads['db1'] to db1, grads['dW2'] to dW2, grads['db2'] to db2
        grads['dW1'] = dW1
        grads['db1'] = db1
        grads['dW2'] = dW2
        grads['db2'] = db2

        # Update parameters.
        ### START CODE HERE ### (approx. 1 line of code)
        parameters = update_parameters(parameters, grads, learning_rate)
        ### END CODE HERE ###

        # Retrieve W1, b1, W2, b2 from parameters
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]

        # Print the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if print_cost and i % 100 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

def L_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):   # lr was 0.009
    """
    Implements an L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.

    Arguments:
    X -- data, numpy array of shape (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 1 if cat, 0 if non-cat), of shape (1, number of examples)
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 100 steps

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """
    np.random.seed(1)
    costs = []   # keep track of cost

    # Parameters initialization.
    ### START CODE HERE ###
    parameters = initialize_parameters_deep(layers_dims)
    ### END CODE HERE ###

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID.
        ### START CODE HERE ### (≈ 1 line of code)
        AL, caches = L_model_forward(X, parameters)
        ### END CODE HERE ###

        # Compute cost.
        ### START CODE HERE ### (≈ 1 line of code)
        cost = compute_cost(AL, Y)
        ### END CODE HERE ###

        # Backward propagation.
        ### START CODE HERE ### (≈ 1 line of code)
        grads = L_model_backward(AL, Y, caches)
        ### END CODE HERE ###

        # Update parameters.
        ### START CODE HERE ### (≈ 1 line of code)
        parameters = update_parameters(parameters, grads, learning_rate)
        ### END CODE HERE ###

        # Print the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" % (i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

# parameters, grads = update_parameters_test_case()
# parameters = update_parameters(parameters, grads, 0.1)
# print ("W1 = " + str(parameters["W1"]))
# print ("b1 = " + str(parameters["b1"]))
# print ("W2 = " + str(parameters["W2"]))
# print ("b2 = " + str(parameters["b2"]))
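A minimal usage sketch for the finished building blocks, assuming the lr_utils.load_dataset() helper from section 2 is available and using the layer sizes from the course's follow-up application notebook; the accuracy helper is written for this note (the application assignment defines its own predict()). Note that with the 0.01 weight scaling in initialize_parameters_deep, a 4-layer model may learn very slowly on this data; the application notebook scales the initialization by 1/sqrt(layer_dims[l-1]) instead.

from lr_utils import load_dataset

train_x_orig, train_y, test_x_orig, test_y, classes = load_dataset()
train_x = train_x_orig.reshape(train_x_orig.shape[0], -1).T / 255.
test_x = test_x_orig.reshape(test_x_orig.shape[0], -1).T / 255.

layers_dims = [12288, 20, 7, 5, 1]   # 4-layer model (sizes from the application notebook)
parameters = L_layer_model(train_x, train_y, layers_dims, num_iterations=2500, print_cost=True)

def accuracy(X, Y, parameters):
    # Helper written for this note: forward pass, threshold at 0.5, compare with labels
    AL, _ = L_model_forward(X, parameters)
    return np.mean((AL > 0.5).astype(float) == Y)

print("train accuracy:", accuracy(train_x, train_y, parameters))
print("test accuracy:", accuracy(test_x, test_y, parameters))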