Model Building, Training and Evaluating - 1.1 English

AI Optimizer User Guide (UG1333)

Document ID
UG1333
Release Date
2020-07-07
Version
1.1 English

Create a file called train_eval_utils.py, and add the following code:

import os, time, sys
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import tensorflow as tf

from low_level_cnn import net_fn
from data_utils import get_init_data

class ConvNet(object):
  '''
  Build, train and evaluate a classification network on the TF 1.x graph API.

  The class wires together the input pipeline returned by
  data_utils.get_init_data, the network returned by low_level_cnn.net_fn,
  and the loss / optimizer / accuracy / summary ops defined below.

  Typical use: instantiate, call build(), then call train_eval() or
  evaluate().
  '''

  def __init__(self, training=True):
    '''
    Store the hyper-parameters and create the global step variable.

    Args:
      training: when True, build() also creates the loss and the optimizer
        ops; the flag is forwarded to net_fn as is_training.
    '''
    self.lr = 0.001          # learning rate handed to AdamOptimizer
    self.train_batch = 128   # batch size passed to get_init_data for training
    self.test_batch = 100    # batch size passed to get_init_data for testing
    self.keep_prob = tf.constant(0.75)  # forwarded to net_fn as keep_prob
    # Incremented by the optimizer on every training step (see optimize()).
    self.gstep = tf.Variable(0, dtype=tf.int64, trainable=False, name='global_step')
    self.n_classes = 10      # forwarded to net_fn as n_classes
    self.skip_step = 100     # the training loss is logged every skip_step steps
    self.n_test = 10000      # denominator used by eval_once to report accuracy
    self.training = training

  def loss(self):
    '''
    Define the loss op.

    Uses softmax cross entropy with logits (softmax is applied internally)
    and averages it over the batch.

    NOTE: the resulting tensor is stored in self.loss, which replaces this
    method on the instance. After the first call self.loss is a tensor, so
    the method cannot be called a second time on the same object.
    '''
    with tf.name_scope('loss'):
      entropy = tf.nn.softmax_cross_entropy_with_logits(labels=self.label, logits=self.logits)
      self.loss = tf.reduce_mean(entropy, name='loss')

  def optimize(self):
    '''
    Define the training op.

    Uses the Adam optimizer to minimize self.loss; every run of the op also
    increments self.gstep.
    '''
    self.opt = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=self.gstep)

  def eval(self):
    '''
    Define the op that counts the number of right predictions in a batch.

    self.accuracy is a count (a sum of 0/1 values), not a ratio; eval_once
    divides the accumulated count by self.n_test.
    '''
    with tf.name_scope('predict'):
      preds = tf.nn.softmax(self.logits)
      correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(self.label, 1))
      self.accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32))

  def summary(self):
    '''
    Create the summaries to write on TensorBoard.

    Accuracy is always recorded; the loss scalar and histogram are added
    only when the model was created with training=True.
    '''
    with tf.name_scope('summaries'):
      tf.summary.scalar('accuracy', self.accuracy)
      if self.training:
        tf.summary.scalar('loss', self.loss)
        tf.summary.histogram('histogram_loss', self.loss)
      self.summary_op = tf.summary.merge_all()

  def build(self, test_only=False):
    '''
    Build the computation graph.

    Args:
      test_only: forwarded to get_init_data as test_only.
    '''
    self.img, self.label, self.train_init, self.test_init = get_init_data(
        self.train_batch, self.test_batch, test_only=test_only)

    self.logits = net_fn(self.img, n_classes=self.n_classes,
                         keep_prob=self.keep_prob, is_training=self.training)
    # The loss and the optimizer only exist in a training graph.
    if self.training:
      self.loss()
      self.optimize()
    self.eval()
    self.summary()

  def train_one_epoch(self, sess, saver, writer, epoch, step):
    '''
    Run the training op until the training iterator is exhausted.

    Args:
      sess: session in which the ops are run.
      saver: currently unused (per-epoch checkpointing is disabled); kept so
        that existing callers keep working.
      writer: summary writer that receives the summaries of every step.
      epoch: epoch index, only used in the log message.
      step: global step value at the beginning of the epoch.

    Returns:
      The step value after the epoch, i.e. step plus the number of batches
      that were processed.
    '''
    start_time = time.time()
    sess.run(self.train_init)
    total_loss = 0
    n_batches = 0
    tf.logging.info(time.strftime('time:%Y-%m-%d %H:%M:%S'))
    try:
      # The loop is ended by the OutOfRangeError raised once the data
      # initialized by train_init runs out.
      while True:
        _, batch_loss, summaries = sess.run([self.opt, self.loss, self.summary_op])
        writer.add_summary(summaries, global_step=step)
        if (step + 1) % self.skip_step == 0:
          tf.logging.info('Loss at step {0}: {1}'.format(step+1, batch_loss))
        step += 1
        total_loss += batch_loss
        n_batches += 1
    except tf.errors.OutOfRangeError:
      pass
    if n_batches:
      tf.logging.info('Average loss at epoch {0}: {1}'.format(epoch, total_loss/n_batches))
    else:
      # Avoid a ZeroDivisionError when the training data yields no batch.
      tf.logging.warning('No training batch was processed at epoch {0}'.format(epoch))
    tf.logging.info('train one epoch took: {0} seconds'.format(time.time() - start_time))
    return step

  def eval_once(self, sess, writer=None, step=None):
    '''
    Run one pass over the test data and log the accuracy.

    Args:
      sess: session in which the ops are run.
      writer: optional summary writer; when given, the summaries of every
        evaluation batch are written to it.
      step: global step attached to the written summaries.
    '''
    start_time = time.time()
    sess.run(self.test_init)
    total_correct_preds = 0
    try:
      # The loop is ended by the OutOfRangeError raised once the data
      # initialized by test_init runs out.
      while True:
        accuracy_batch, summaries = sess.run([self.accuracy, self.summary_op])
        if writer:
          writer.add_summary(summaries, global_step=step)
        total_correct_preds += accuracy_batch
    except tf.errors.OutOfRangeError:
      pass
    tf.logging.info('Evaluation took: {0} seconds'.format(time.time() - start_time))
    # The count of right predictions is normalized by the fixed self.n_test.
    tf.logging.info('Accuracy : {0} \n'.format(total_correct_preds/self.n_test))

  def train_eval(self, n_epochs=10, save_ckpt=None, restore_ckpt=None):
    '''
    Alternate between training one epoch and evaluating.

    Args:
      n_epochs: number of train / evaluate rounds.
      save_ckpt: path the final checkpoint is saved to. When it is None (or
        empty) no checkpoint is written and a warning is logged.
      restore_ckpt: optional checkpoint to restore before training. It also
        selects the summary directory ('finetune' instead of 'train').
    '''
    if restore_ckpt:
      log_dir = './graphs/convnet/finetune'
    else:
      log_dir = './graphs/convnet/train'
    writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())
    try:
      with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        if restore_ckpt:
          saver.restore(sess, restore_ckpt)
        step = self.gstep.eval(session=sess)
        for epoch in range(n_epochs):
          step = self.train_one_epoch(sess, saver, writer, epoch, step)
          self.eval_once(sess, writer, step)
        if save_ckpt:
          saver.save(sess, save_ckpt)
        else:
          # save_ckpt defaults to None, which is not a usable save path.
          tf.logging.warning('save_ckpt is not set, the trained model is not saved')
    finally:
      # Flush and release the event file even when training fails.
      writer.close()
    tf.logging.info("Finish")

  def evaluate(self, restore_ckpt):
    '''
    Restore a checkpoint and evaluate it once on the test data.

    Args:
      restore_ckpt: path of the checkpoint to restore.
    '''
    with tf.Session() as sess:
      saver = tf.train.Saver()
      saver.restore(sess, restore_ckpt)
      self.eval_once(sess)
    tf.logging.info("Finish")

ConvNet is a class that builds the graph and trains and evaluates the model. It is a framework that combines the data utilities, the network definition, and the metrics. To train or evaluate a model, instantiate the ConvNet class, then call its build method to build the training or evaluation graph, depending on whether the test_only argument is true.