TensorFlow CNN for Fixed‑Length ID Card Number OCR
This article demonstrates how to build a TensorFlow‑based CNN to recognize fixed‑length 18‑digit Chinese ID card numbers, covering environment setup, synthetic data generation, model architecture, training procedure, and achieved accuracy of over 84%.
The author explores OCR for Chinese ID cards, focusing on a concrete goal: recognizing the fixed‑length 18‑digit ID number using an end‑to‑end convolutional neural network (CNN) implemented with TensorFlow.
Environment dependencies – The project relies on TensorFlow (recommended via Anaconda), freetype‑py for on‑the‑fly image generation, and common libraries such as NumPy and OpenCV.
<code>pip install freetype-py
pip install numpy opencv-python</code>Synthetic training data generation – Images of size 32×256 are generated with freetype‑py. Each image encodes an 18‑character numeric string; the label is converted to a one‑hot vector of length 180 (18 positions × 10 possible digits). The following Python classes implement text rendering and data creation.
<code>#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
身份证文字+数字生成类
@author: pengyuanjie
"""
import numpy as np
import freetype
import copy
import random
import cv2
class put_chinese_text(object):
    """Render (possibly Chinese) text onto numpy BGR images with freetype-py."""

    def __init__(self, ttf):
        """Load the FreeType font face from the given .ttf file path."""
        self._face = freetype.Face(ttf)

    def draw_text(self, image, pos, text, text_size, text_color):
        '''
        Draw chinese (or not) text with ttf.
        :param image: image (numpy.ndarray, HxWx3) to draw text on
        :param pos: (x, y) position where the text starts
        :param text: the content; byte strings are decoded as UTF-8
        :param text_size: text size in points
        :param text_color: text color, e.g. (255, 255, 255)
        :return: a new image with the text drawn (input is not modified)
        '''
        # FreeType measures character size in 1/64 point units.
        self._face.set_char_size(text_size * 64)
        metrics = self._face.size
        ascender = metrics.ascender / 64.0
        ypos = int(ascender)
        # Py2/Py3 compatible decode: the original tested
        # `isinstance(text, unicode)`, which is a NameError on Python 3.
        # On Python 2 `bytes` is `str`, so this branch is equivalent.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        img = self.draw_string(image, pos[0], pos[1] + ypos, text, text_color)
        return img

    def draw_string(self, img, x_pos, y_pos, text, color):
        '''
        Draw a string glyph by glyph.
        :param x_pos: text x-position on img
        :param y_pos: text baseline y-position on img
        :param text: text (unicode)
        :param color: text color
        :return: a new image (deep copy of img) with the string drawn
        '''
        prev_char = 0
        pen = freetype.Vector()
        # Pen coordinates are in 1/64 pixel units, hence the << 6.
        pen.x = x_pos << 6
        pen.y = y_pos << 6
        hscale = 1.0
        # 2x2 glyph transform in 16.16 fixed point: horizontal scale,
        # slight shear (0.2) and vertical stretch (1.1).
        # Fixes vs. original: `0x10000L` long literals (SyntaxError on
        # Python 3; plain ints auto-promote on Python 2), and
        # `int(hscale) * 0x10000`, which truncated any fractional hscale
        # before scaling — scale first, then truncate.
        matrix = freetype.Matrix(int(hscale * 0x10000), int(0.2 * 0x10000),
                                 int(0.0 * 0x10000), int(1.1 * 0x10000))
        cur_pen = freetype.Vector()
        pen_translate = freetype.Vector()
        image = copy.deepcopy(img)
        for cur_char in text:
            self._face.set_transform(matrix, pen_translate)
            self._face.load_char(cur_char)
            # Advance the pen by the kerning between adjacent glyphs.
            kerning = self._face.get_kerning(prev_char, cur_char)
            pen.x += kerning.x
            slot = self._face.glyph
            bitmap = slot.bitmap
            cur_pen.x = pen.x
            cur_pen.y = pen.y - slot.bitmap_top * 64
            self.draw_ft_bitmap(image, bitmap, cur_pen, color)
            pen.x += slot.advance.x
            prev_char = cur_char
        return image

    def draw_ft_bitmap(self, img, bitmap, pen, color):
        '''
        Draw one rendered glyph bitmap onto img, in place.
        :param bitmap: FreeType glyph bitmap (8-bit coverage buffer)
        :param pen: pen position in 1/64 pixel units
        :param color: pen color, e.g. (0, 0, 255) - red
        '''
        x_pos = pen.x >> 6
        y_pos = pen.y >> 6
        cols = bitmap.width
        rows = bitmap.rows
        glyph_pixels = bitmap.buffer
        # Stamp every covered pixel with the requested color.
        for row in range(rows):
            for col in range(cols):
                if glyph_pixels[row * cols + col] != 0:
                    img[y_pos + row][x_pos + col][0] = color[0]
                    img[y_pos + row][x_pos + col][1] = color[1]
                    img[y_pos + row][x_pos + col][2] = color[2]
class gen_id_card(object):
    """Generate synthetic 18-digit ID-number images and one-hot labels."""

    def __init__(self):
        # Character set: the ten decimal digits.
        self.number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.char_set = self.number
        self.len = len(self.char_set)   # classes per position (10)
        self.max_size = 18              # fixed length of an ID number
        self.ft = put_chinese_text('fonts/OCR-B.ttf')

    def random_text(self):
        """Return (text, vecs): a random 18-digit string and its
        concatenated one-hot encoding of length max_size * len (180)."""
        text = ''
        vecs = np.zeros((self.max_size * self.len))
        for i in range(self.max_size):
            c = random.choice(self.char_set)
            text += c
            # Slice assignment copies the values, so no np.copy needed.
            vecs[i * self.len:(i + 1) * self.len] = self.char2vec(c)
        return text, vecs

    def gen_image(self):
        """Render a random ID number onto a 32x256 black canvas.
        :return: (grayscale image, text string, one-hot label vector)
        """
        text, vec = self.random_text()
        img = np.zeros([32, 256, 3])
        color_ = (255, 255, 255)  # white
        pos = (0, 0)
        text_size = 21
        image = self.ft.draw_text(img, pos, text, text_size, color_)
        # All three channels carry the same text; one channel suffices.
        return image[:, :, 2], text, vec

    def char2vec(self, c):
        """One-hot encode a single character against char_set.
        Fixed idiom: direct index lookup instead of a manual scan; an
        unknown character now raises ValueError instead of silently
        returning an all-zero vector."""
        vec = np.zeros((self.len))
        vec[self.char_set.index(c)] = 1
        return vec

    def vec2text(self, vecs):
        """Decode a concatenated one-hot vector back to its text."""
        return ''.join(self.char_set[i % self.len]
                       for i, v in enumerate(vecs) if v == 1)
if __name__ == '__main__':
    # Smoke test: render one random ID-card image and display it.
    generator = gen_id_card()
    img, text, label_vec = generator.gen_image()
    cv2.imshow('image', img)
    cv2.waitKey(0)
</code>Batch generation – A helper function creates a batch of synthetic images and their corresponding one‑hot vectors for training.
<code># 生成一个训练batch
def get_next_batch(batch_size=128):
obj = gen_id_card()
batch_x = np.zeros([batch_size, IMAGE_HEIGHT*IMAGE_WIDTH])
batch_y = np.zeros([batch_size, MAX_CAPTCHA*CHAR_SET_LEN])
for i in range(batch_size):
image, text, vec = obj.gen_image()
batch_x[i,:] = image.reshape((IMAGE_HEIGHT*IMAGE_WIDTH))
batch_y[i,:] = vec
return batch_x, batch_y
</code>Batch Normalization utility – A TensorFlow implementation of batch normalization is provided for use in the network.
<code>def batch_norm(x, beta, gamma, phase_train, scope='bn', decay=0.9, eps=1e-5):
with tf.variable_scope(scope):
batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=decay)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train, mean_var_with_update,
lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, eps)
return normed
</code>Model architecture – The CNN consists of four convolutional layers (two 5×5 and two 3×3 kernels), each followed by batch normalization, ReLU activation, max‑pooling, and dropout, then a fully connected layer and an output layer that predicts the one‑hot encoding of the 18‑digit sequence.
<code>def crack_captcha_cnn(w_alpha=0.01, b_alpha=0.1):
x = tf.reshape(X, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])
# 4 conv layer
w_c1 = tf.Variable(w_alpha*tf.random_normal([5, 5, 1, 32]))
b_c1 = tf.Variable(b_alpha*tf.random_normal([32]))
conv1 = tf.nn.bias_add(tf.nn.conv2d(x, w_c1, strides=[1, 1, 1, 1], padding='SAME'), b_c1)
conv1 = batch_norm(conv1, tf.constant(0.0, shape=[32]), tf.random_normal(shape=[32], mean=1.0, stddev=0.02), train_phase, scope='bn_1')
conv1 = tf.nn.relu(conv1)
conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
conv1 = tf.nn.dropout(conv1, keep_prob)
w_c2 = tf.Variable(w_alpha*tf.random_normal([5, 5, 32, 64]))
b_c2 = tf.Variable(b_alpha*tf.random_normal([64]))
conv2 = tf.nn.bias_add(tf.nn.conv2d(conv1, w_c2, strides=[1, 1, 1, 1], padding='SAME'), b_c2)
conv2 = batch_norm(conv2, tf.constant(0.0, shape=[64]), tf.random_normal(shape=[64], mean=1.0, stddev=0.02), train_phase, scope='bn_2')
conv2 = tf.nn.relu(conv2)
conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
conv2 = tf.nn.dropout(conv2, keep_prob)
w_c3 = tf.Variable(w_alpha*tf.random_normal([3, 3, 64, 64]))
b_c3 = tf.Variable(b_alpha*tf.random_normal([64]))
conv3 = tf.nn.bias_add(tf.nn.conv2d(conv2, w_c3, strides=[1, 1, 1, 1], padding='SAME'), b_c3)
conv3 = batch_norm(conv3, tf.constant(0.0, shape=[64]), tf.random_normal(shape=[64], mean=1.0, stddev=0.02), train_phase, scope='bn_3')
conv3 = tf.nn.relu(conv3)
conv3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
conv3 = tf.nn.dropout(conv3, keep_prob)
w_c4 = tf.Variable(w_alpha*tf.random_normal([3, 3, 64, 64]))
b_c4 = tf.Variable(b_alpha*tf.random_normal([64]))
conv4 = tf.nn.bias_add(tf.nn.conv2d(conv3, w_c4, strides=[1, 1, 1, 1], padding='SAME'), b_c4)
conv4 = batch_norm(conv4, tf.constant(0.0, shape=[64]), tf.random_normal(shape=[64], mean=1.0, stddev=0.02), train_phase, scope='bn_4')
conv4 = tf.nn.relu(conv4)
conv4 = tf.nn.max_pool(conv4, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
conv4 = tf.nn.dropout(conv4, keep_prob)
# Fully connected layer
w_d = tf.Variable(w_alpha*tf.random_normal([2*16*64, 1024]))
b_d = tf.Variable(b_alpha*tf.random_normal([1024]))
dense = tf.reshape(conv4, [-1, w_d.get_shape().as_list()[0]])
dense = tf.nn.relu(tf.add(tf.matmul(dense, w_d), b_d))
dense = tf.nn.dropout(dense, keep_prob)
w_out = tf.Variable(w_alpha*tf.random_normal([1024, MAX_CAPTCHA*CHAR_SET_LEN]))
b_out = tf.Variable(b_alpha*tf.random_normal([MAX_CAPTCHA*CHAR_SET_LEN]))
out = tf.add(tf.matmul(dense, w_out), b_out)
return out
</code>Training loop – The network is trained with sigmoid cross‑entropy loss, Adam optimizer, and periodic accuracy checks; training stops once accuracy exceeds 80 % and the model is saved.
<code># 训练
def train_crack_captcha_cnn():
output = crack_captcha_cnn()
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=0.002).minimize(loss)
predict = tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN])
max_idx_p = tf.argmax(predict, 2)
max_idx_l = tf.argmax(tf.reshape(Y, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)
correct_pred = tf.equal(max_idx_p, max_idx_l)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
step = 0
while True:
batch_x, batch_y = get_next_batch(64)
_, loss_ = sess.run([optimizer, loss], feed_dict={X: batch_x, Y: batch_y, keep_prob: 0.75, train_phase:True})
print(step, loss_)
if step % 100 == 0 and step != 0:
batch_x_test, batch_y_test = get_next_batch(100)
acc = sess.run(accuracy, feed_dict={X: batch_x_test, Y: batch_y_test, keep_prob: 1., train_phase:False})
print "第%s步,训练准确率为:%s" % (step, acc)
if acc > 0.8:
saver.save(sess, "crack_capcha.model", global_step=step)
break
step += 1
</code>The author reports that after roughly 500 training iterations the model reaches an accuracy of 84.3 %, and notes that reducing the image size from 64×512 to 32×256 was crucial for convergence.
Future work includes extending the approach to variable‑length Chinese character strings by employing an LSTM‑CTC architecture.
Python Programming Learning Circle
A global community of Chinese Python developers offering technical articles, columns, original video tutorials, and problem sets. Topics include web full‑stack development, web scraping, data analysis, natural language processing, image processing, machine learning, automated testing, DevOps automation, and big data.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.