This log is a step-by-step breakdown of the source code from the Jupyter notebook in Udacity's "Intro to TensorFlow" course. The goal is to understand the code logic and become familiar with the typical workflow for building a model. It draws on a number of blog posts, for which I am very grateful; they are all listed at the end and not cited inline.

Data link: Baidu Cloud: NoMNIST password: fsks

P1: Preprocess the data

import hashlib
import os
import pickle
from urllib.request import urlretrieve

import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import resample
from tqdm import tqdm
from zipfile import ZipFile

Extract the image files from the zip archives

def uncompress_features_labels(file):
    """
    Uncompress features and labels from a zip file
    :param file: The zip file to extract the data from
    """
    features = []
    labels = []

    with ZipFile(file) as zipf:
        # Progress Bar
        filenames_pbar = tqdm(zipf.namelist(), unit='files')
        
        # Get features and labels from all files
        for filename in filenames_pbar:
            # Check if the file is a directory
            if not filename.endswith('/'):
                with zipf.open(filename) as image_file:
                    image = Image.open(image_file)
                    image.load()
                    # Load image data as 1 dimensional array
                    # We're using float32 to save on memory space
                    feature = np.array(image, dtype=np.float32).flatten()

                # Get the letter from the filename.  This is the letter of the image.
                label = os.path.split(filename)[1][0]

                features.append(feature)
                labels.append(label)
    return np.array(features), np.array(labels)

# Get the features and labels from the zip files
train_features, train_labels = uncompress_features_labels('notMNIST_train.zip')
test_features, test_labels = uncompress_features_labels('notMNIST_test.zip')

# Limit the amount of data to work with a docker container
docker_size_limit = 150000
train_features, train_labels = resample(train_features, train_labels, n_samples=docker_size_limit)

# Set flags for feature engineering.  This will prevent you from skipping an important step.
is_features_normal = False
is_labels_encod = False
100%|█████████████████████████████████████████████████████████████████████| 210001/210001 [00:54<00:00, 3832.78files/s]
100%|███████████████████████████████████████████████████████████████████████| 10001/10001 [00:03<00:00, 3207.15files/s]

Min-Max Scaling

Implement Min-Max scaling in the normalize_grayscale() function to a range of a=0.1 and b=0.9. After scaling, the values of the pixels in the input data should range from 0.1 to 0.9.

Since the raw notMNIST image data is in grayscale, the current values range from a min of 0 to a max of 255.

Min-Max Scaling: $X' = a + \dfrac{(X - X_{\min})(b - a)}{X_{\max} - X_{\min}}$
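For example, with a = 0.1 and b = 0.9, a mid-gray pixel X = 128 maps to $0.1 + \frac{(128 - 0)(0.9 - 0.1)}{255 - 0} \approx 0.5016$, while X = 0 and X = 255 map exactly to 0.1 and 0.9.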

def normalize_grayscale(image_data):
    """
    Normalize the image data with Min-Max scaling to a range of [0.1, 0.9]
    :param image_data: The image data to be normalized
    :return: Normalized image data
    """
 
    a = 0.1
    b = 0.9
    max_grayscale = 255
    min_grayscale = 0
    return a + (image_data - min_grayscale) * (b - a) / (max_grayscale - min_grayscale)
    
train_features = normalize_grayscale(train_features)
test_features = normalize_grayscale(test_features)

One-hot encode the labels

LabelBinarizer() from sklearn.preprocessing converts non-numeric class labels into one-hot encoded vectors.

# Create the encoder
encoder = LabelBinarizer()

# Fit the encoder to find the classes and assign a one-hot vector to each
encoder.fit(train_labels)

# Finally, transform the targets (labels) into one-hot encoded vectors
train_labels = encoder.transform(train_labels)
test_labels = encoder.transform(test_labels)

Convert the labels to float32 so they can be used in the computations that follow.

train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)
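To make the one-hot encoding above concrete, here is a minimal sketch with made-up toy labels (illustration only; the real labels are the letters A to J):

from sklearn.preprocessing import LabelBinarizer

# Toy illustration of LabelBinarizer, not part of the pipeline
toy_encoder = LabelBinarizer()
toy_encoder.fit(['A', 'B', 'C'])
print(toy_encoder.classes_)          # ['A' 'B' 'C']
print(toy_encoder.transform(['B']))  # [[0 1 0]]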

Randomly split the data into training and validation sets

A common usage pattern is:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target, test_size=0.4, random_state=0)

Parameters:

  • train_data: the sample features to be split
  • train_target: the corresponding sample targets (labels) to be split
  • test_size: if a float, the proportion of the data held out; if an integer, the absolute number of held-out samples
  • random_state: the seed for the random number generator.

Random seed: think of it as the ID of a particular random sequence. When an experiment needs to be repeatable, fixing the seed guarantees the same random numbers every time. For example, always passing 1 (with the other parameters unchanged) produces the same split on every run, and so does any other fixed integer, including 0; only passing None (or omitting random_state) gives a different split on each run. The sketch below illustrates the reproducibility.
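A minimal sketch of that reproducibility, using toy data rather than the project data:

from sklearn.model_selection import train_test_split

X = list(range(10))
y = list(range(10))

# The same fixed seed yields exactly the same split on every call
split_1 = train_test_split(X, y, test_size=0.3, random_state=1)
split_2 = train_test_split(X, y, test_size=0.3, random_state=1)
print(split_1 == split_2)  # True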

# Get randomized datasets for training and validation
train_features, valid_features, train_labels, valid_labels = train_test_split(
    train_features,
    train_labels,
    test_size=0.05,
    random_state=832289)

Pickle the data so it can be reloaded later

Serialization is done with pickle.dump(); its signature is:
pickle.dump(obj, file, protocol=None, *, fix_imports=True)
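As a quick aside, dump and load are symmetric. A toy round trip (made-up filename, not the project data) looks like this:

import pickle

# Toy round trip, illustration only
with open('toy.pickle', 'wb') as f:      # write in binary mode
    pickle.dump({'answer': 42}, f, pickle.HIGHEST_PROTOCOL)

with open('toy.pickle', 'rb') as f:      # read back in binary mode
    print(pickle.load(f))                # {'answer': 42}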

# Create the pickle file
# The file argument must be opened in binary mode, i.e. 'wb'
pickle_file = 'notMNIST.pickle'
if not os.path.isfile(pickle_file):
    print('Saving data to pickle file...')
    try:
        with open('notMNIST.pickle', 'wb') as pfile:
            pickle.dump(
                {
                    'train_dataset': train_features,
                    'train_labels': train_labels,
                    'valid_dataset': valid_features,
                    'valid_labels': valid_labels,
                    'test_dataset': test_features,
                    'test_labels': test_labels,
                },
                pfile, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        raise

P2: Load the data back from the preprocessed pickle file

%matplotlib inline

# Load the modules
import pickle
import math

import numpy as np
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt

# Reload the data
pickle_file = 'notMNIST.pickle'
with open(pickle_file, 'rb') as f:
  pickle_data = pickle.load(f)
  train_features = pickle_data['train_dataset']
  train_labels = pickle_data['train_labels']
  valid_features = pickle_data['valid_dataset']
  valid_labels = pickle_data['valid_labels']
  test_features = pickle_data['test_dataset']
  test_labels = pickle_data['test_labels']
  del pickle_data  # Free up memory

Build a single-layer neural network with TensorFlow

Next, we use TensorFlow to build a neural network with just one input layer and one output layer, using softmax as the activation function.
In TensorFlow, data is not stored as integers, floats, or strings; it is stored as tensor objects.

There are two ways to get values into a tensor:

  • Use tf.constant() to pass in a value; once set, it cannot be changed.
  • To make the data changeable, use tf.placeholder() together with the feed_dict argument of Session.run(), as the sketch below shows.
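A minimal sketch of both approaches with toy values (TensorFlow 1.x API, as in the rest of this notebook):

import tensorflow as tf

# Toy illustration of tf.constant vs. tf.placeholder + feed_dict
c = tf.constant([1.0, 2.0, 3.0])              # value fixed at graph-construction time
x = tf.placeholder(tf.float32, shape=[3])     # value supplied at run time

with tf.Session() as sess:
    print(sess.run(c))                                  # [1. 2. 3.]
    print(sess.run(x, feed_dict={x: [4.0, 5.0, 6.0]}))  # [4. 5. 6.]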
# All the pixels in the image (28 * 28 = 784)
features_count = 784
# All the labels ("A,B...J")
labels_count = 10

features = tf.placeholder(tf.float32)
labels = tf.placeholder(tf.float32)

# Set the weights and biases tensors
# tf.truncated_normal: generates random values from a truncated normal distribution
# Since the weights are randomly initialized, the biases do not need to be; setting them to zero is enough

weights = tf.Variable(tf.truncated_normal((features_count,labels_count)))
biases = tf.Variable(tf.zeros(labels_count))

# Feed dicts for training, validation, and test session
train_feed_dict = {features: train_features, labels: train_labels}
valid_feed_dict = {features: valid_features, labels: valid_labels}
test_feed_dict = {features: test_features, labels: test_labels}

# Linear Function WX + b
logits = tf.matmul(features, weights) + biases

prediction = tf.nn.softmax(logits)

# Cross entropy
cross_entropy = -tf.reduce_sum(labels * tf.log(prediction), reduction_indices=1)

# Training loss
loss = tf.reduce_mean(cross_entropy)

# Create an operation that initializes all variables
init = tf.global_variables_initializer()

# Test Cases
with tf.Session() as session:
    session.run(init)
    session.run(loss, feed_dict=train_feed_dict)
    session.run(loss, feed_dict=valid_feed_dict)
    session.run(loss, feed_dict=test_feed_dict)
    biases_data = session.run(biases)
is_correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct_prediction, tf.float32))
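To see what the accuracy op computes, here is a small sketch with made-up predictions and one-hot labels (not the model above):

import numpy as np
import tensorflow as tf

# Toy one-hot labels and toy softmax outputs, illustration only
toy_labels = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
toy_preds = np.array([[0.8, 0.1, 0.1], [0.2, 0.7, 0.1], [0.3, 0.4, 0.3]], dtype=np.float32)

toy_correct = tf.equal(tf.argmax(toy_preds, 1), tf.argmax(toy_labels, 1))
toy_accuracy = tf.reduce_mean(tf.cast(toy_correct, tf.float32))

with tf.Session() as sess:
    print(sess.run(toy_accuracy))  # 2 of the 3 rows match -> 0.6666667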

P3: Train the neural network

# Change if you have memory restrictions
batch_size = 128

# Find the best parameters for each configuration
epochs =  4
learning_rate = 0.2


# Gradient Descent
# Train by minimizing the loss with gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)    

# The accuracy measured against the validation set
validation_accuracy = 0.0

# Measurements used for graphing loss and accuracy
log_batch_step = 50
batches = []
loss_batch = []
train_acc_batch = []
valid_acc_batch = []

with tf.Session() as session:
    session.run(init)
    batch_count = int(math.ceil(len(train_features)/batch_size))

    for epoch_i in range(epochs):
        
        # Progress bar
        batches_pbar = tqdm(range(batch_count), desc='Epoch {:>2}/{}'.format(epoch_i+1, epochs), unit='batches')
        
        # The training cycle
        for batch_i in batches_pbar:
            # Get a batch of training features and labels
            batch_start = batch_i*batch_size
            batch_features = train_features[batch_start:batch_start + batch_size]
            batch_labels = train_labels[batch_start:batch_start + batch_size]

            # Run optimizer and get loss
            _, l = session.run(
                [optimizer, loss],
                feed_dict={features: batch_features, labels: batch_labels})

            # Log every 50 batches
            if not batch_i % log_batch_step:
                # Calculate Training and Validation accuracy
                training_accuracy = session.run(accuracy, feed_dict=train_feed_dict)
                validation_accuracy = session.run(accuracy, feed_dict=valid_feed_dict)

                # Log batches
                previous_batch = batches[-1] if batches else 0
                batches.append(log_batch_step + previous_batch)
                loss_batch.append(l)
                train_acc_batch.append(training_accuracy)
                valid_acc_batch.append(validation_accuracy)

        # Check accuracy against Validation data
        validation_accuracy = session.run(accuracy, feed_dict=valid_feed_dict)

loss_plot = plt.subplot(211)
loss_plot.set_title('Loss')
loss_plot.plot(batches, loss_batch, 'g')
loss_plot.set_xlim([batches[0], batches[-1]])
acc_plot = plt.subplot(212)
acc_plot.set_title('Accuracy')
acc_plot.plot(batches, train_acc_batch, 'r', label='Training Accuracy')
acc_plot.plot(batches, valid_acc_batch, 'x', label='Validation Accuracy')
acc_plot.set_ylim([0, 1.0])
acc_plot.set_xlim([batches[0], batches[-1]])
acc_plot.legend(loc=4)
plt.tight_layout()
plt.show()

print('Validation accuracy at {}'.format(validation_accuracy))
Epoch  1/4: 100%|████████████████████████████████████████████████████████████| 1114/1114 [00:11<00:00, 101.27batches/s]
Epoch  2/4: 100%|████████████████████████████████████████████████████████████| 1114/1114 [00:10<00:00, 101.99batches/s]
Epoch  3/4: 100%|████████████████████████████████████████████████████████████| 1114/1114 [00:10<00:00, 101.38batches/s]
Epoch  4/4: 100%|█████████████████████████████████████████████████████████████| 1114/1114 [00:12<00:00, 92.55batches/s]

[Figure: loss curve and training/validation accuracy plotted against batches]

Validation accuracy at 0.7662666440010071

P4: Test the model

test_accuracy = 0.0

with tf.Session() as session:
    
    session.run(init)
    batch_count = int(math.ceil(len(train_features)/batch_size))

    for epoch_i in range(epochs):
        
        # Progress bar
        batches_pbar = tqdm(range(batch_count), desc='Epoch {:>2}/{}'.format(epoch_i+1, epochs), unit='batches')
        
        # The training cycle
        for batch_i in batches_pbar:
            # Get a batch of training features and labels
            batch_start = batch_i*batch_size
            batch_features = train_features[batch_start:batch_start + batch_size]
            batch_labels = train_labels[batch_start:batch_start + batch_size]

            # Run optimizer
            _ = session.run(optimizer, feed_dict={features: batch_features, labels: batch_labels})

        # Check accuracy against Test data
        test_accuracy = session.run(accuracy, feed_dict=test_feed_dict)


assert test_accuracy >= 0.80, 'Test accuracy at {}, should be equal to or greater than 0.80'.format(test_accuracy)
print('Nice Job! Test Accuracy is {}'.format(test_accuracy))
Epoch  1/4: 100%|████████████████████████████████████████████████████████████| 1114/1114 [00:01<00:00, 588.57batches/s]
Epoch  2/4: 100%|████████████████████████████████████████████████████████████| 1114/1114 [00:01<00:00, 634.64batches/s]
Epoch  3/4: 100%|████████████████████████████████████████████████████████████| 1114/1114 [00:01<00:00, 633.74batches/s]
Epoch  4/4: 100%|████████████████████████████████████████████████████████████| 1114/1114 [00:01<00:00, 638.60batches/s]


Nice Job! Test Accuracy is 0.8468999862670898

Reference links: