
Overview

If you browse around various websites, you will often come across image CAPTCHAs. In this post, we experiment with recognizing them using an AI model.

※ A CAPTCHA is a kind of HIP (Human Interaction Proof) technique used to determine whether a user is a real human or a computer program.

Note that the implementation was written with reference to the URL below.

Implementation

import os
import cv2
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

Random seed control

Fix the seed so that the same sequence of random numbers is generated on every run.

Any value will do; it does not have to be 1111.

seed = 1111
np.random.seed(seed)
tf.random.set_seed(seed)

Directory definitions

Define the base root directory and the dataset directory.

Since I ran this experiment in a Jupyter notebook, I used os.getcwd() to take the current working directory as the base path.

BASE_DIR = os.getcwd()
DATASET_DIR = os.path.join(BASE_DIR, 'datasets', 'captcha_images_v2')

Downloading the dataset

Download the dataset from the URL below, extract it, and then remove the zip archive once extraction is complete.

if not os.path.isdir(DATASET_DIR):
    path_to_downloaded_file = tf.keras.utils.get_file(
        "captcha_images_v2.zip",
        "https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip",    
        cache_dir=os.getcwd(),
        archive_format=['zip'],
        extract=True)
    zip_filepath = os.path.join(BASE_DIR, 'datasets', 'captcha_images_v2.zip')
    os.remove(zip_filepath)

Downloading data from https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
9076736/9075967 [==============================] - 1s 0us/step    

Raw data visualization

Let's visualize four of the images from the downloaded dataset using the matplotlib package.

data_dir = Path(DATASET_DIR)

images = list(data_dir.glob("*.png"))
sample_images = images[:4]

_,ax = plt.subplots(2,2, figsize=(8,3))
for i in range(4):
    img = cv2.imread(str(sample_images[i]))
    print("Shape of image: ", img.shape)
    ax[i//2, i%2].imshow(img)
    ax[i//2, i%2].axis('off')

Shape of image:  (50, 200, 3)
Shape of image:  (50, 200, 3)
Shape of image:  (50, 200, 3)
Shape of image:  (50, 200, 3)

Data preprocessing

Let's extract the set of unique characters (characters) and build a pandas DataFrame of image paths and labels.

To help avoid overfitting during training, shuffle the dataset thoroughly by calling sample() with frac=1.0 (the fraction of rows to return), which returns all rows in random order.

characters = set()
captcha_length = []
dataset = []

for img_path in images:
    label = img_path.name.split(".png")[0]
    captcha_length.append(len(label))
    dataset.append((str(img_path), label))
    for ch in label:
        characters.add(ch)

characters = sorted(characters)
dataset = pd.DataFrame(dataset, columns=["img_path", "label"], index=None)
dataset = dataset.sample(frac=1.).reset_index(drop=True)
dataset.head()

img_path label
0 D:\captcha_recognition\datas... 6xxdx
1 D:\captcha_recognition\datas... e4gd7
2 D:\captcha_recognition\datas... dn26n
3 D:\captcha_recognition\datas... 42dw4
4 D:\captcha_recognition\datas... 3nnpw

'>>> charsets: {}'.format(characters)

">>> charsets: ['2', '3', '4', '5', '6', '7', '8', 'b', 'c', 'd', 'e', 'f', 'g', 'm', 'n', 'p', 'w', 'x', 'y']"

 

Use scikit-learn's train_test_split function to split the data into 90% training data and 10% validation data.

training_data, validation_data = train_test_split(dataset, test_size=0.1, random_state=seed)

training_data = training_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)

char_to_labels = {char:idx for idx, char in enumerate(characters)}
labels_to_char = {val:key for key, val in char_to_labels.items()}

print('>>> char_to_labels: {}'.format(char_to_labels))
print('>>> labels_to_char: {}'.format(labels_to_char))

>>> char_to_labels: {'2': 0, '3': 1, '4': 2, '5': 3, '6': 4, '7': 5, '8': 6, 'b': 7, 'c': 8, 'd': 9, 'e': 10, 'f': 11, 'g': 12, 'm': 13, 'n': 14, 'p': 15, 'w': 16, 'x': 17, 'y': 18}
>>> labels_to_char: {0: '2', 1: '3', 2: '4', 3: '5', 4: '6', 5: '7', 6: '8', 7: 'b', 8: 'c', 9: 'd', 10: 'e', 11: 'f', 12: 'g', 13: 'm', 14: 'n', 15: 'p', 16: 'w', 17: 'x', 18: 'y'}

 

The following function checks whether a captcha label consists only of characters from the character set extracted above.

def is_valid_captcha(captcha):
    for ch in captcha:
        if not ch in characters:
            return False
    return True
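
For example, with the character set extracted above (a quick illustrative check; 'a' and '1' never occur in this dataset):

print(is_valid_captcha('dn26n'))   # True  - every character is in `characters`
print(is_valid_captcha('a1b2c'))   # False - 'a' and '1' are not in the character set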

Now let's convert the pandas DataFrame back into numpy arrays.

 

def generate_arrays(df, resize=True, img_height=50, img_width=200):    
    num_items = len(df)
    images = np.zeros((num_items, img_height, img_width), dtype=np.float32)
    labels = [0]*num_items
    
    for i in range(num_items):
        img = cv2.imread(df["img_path"][i])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        if resize: 
            img = cv2.resize(img, (img_width, img_height))
        
        img = (img/255.).astype(np.float32)
        label = df["label"][i]
        
        if is_valid_captcha(label):
            images[i, :, :] = img
            labels[i] = label
    
    return images, np.array(labels)

training_data, training_labels = generate_arrays(df=training_data)
validation_data, validation_labels = generate_arrays(df=validation_data)
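
A quick shape check on the converted arrays (a small sketch; the exact counts depend on the dataset and the 90/10 split, and here work out to roughly 936 training and 104 validation samples):

print(training_data.shape, training_labels.shape)        # e.g. (936, 50, 200) (936,)
print(validation_data.shape, validation_labels.shape)    # e.g. (104, 50, 200) (104,)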

Data generator

 

Since training feeds each image to the LSTM as a sequence, the image has to be transposed so that the width (the reading direction of the text) becomes the time axis.

 

plt.imshow(training_data[0].T)
training_data[0].shape, training_labels[0], np.expand_dims(training_data[0].T, axis=-1).shape

((50, 200), 'b5fm7', (200, 50, 1))

class DataGenerator(keras.utils.Sequence):

    
    def __init__(self,
                 data,
                 labels,
                 char_map,
                 batch_size=16,
                 img_width=200,
                 img_height=50,
                 downsample_factor=4,
                 max_length=5,
                 shuffle=True
                ):
        self.data = data
        self.labels = labels
        self.char_map = char_map
        self.batch_size = batch_size
        self.img_width = img_width
        self.img_height = img_height
        self.downsample_factor = downsample_factor
        self.max_length = max_length
        self.shuffle = shuffle
        self.indices = np.arange(len(data))    
        self.on_epoch_end()
        
    def __len__(self):
        return int(np.ceil(len(self.data) / self.batch_size))
    
    def __getitem__(self, idx):
        # 1. Get the next batch indices
        curr_batch_idx = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
        
        # 2. This isn't strictly necessary, but it saves some memory since
        # the last batch may not contain batch_size elements
        batch_len = len(curr_batch_idx)
        
        # 3. Instantiate batch arrays
        batch_images = np.ones((batch_len, self.img_width, self.img_height, 1),
                               dtype=np.float32)
        batch_labels = np.ones((batch_len, self.max_length), dtype=np.float32)
        input_length = np.ones((batch_len, 1), dtype=np.int64) * \
                                (self.img_width // self.downsample_factor - 2)
        label_length = np.zeros((batch_len, 1), dtype=np.int64)
        
        
        for j, idx in enumerate(curr_batch_idx):
            # 1. Get the image and transpose it
            img = self.data[idx].T
            # 2. Add an extra channel dimension
            img = np.expand_dims(img, axis=-1)
            # 3. Get the corresponding label
            text = self.labels[idx]
            # 4. Include the pair only if the captcha is valid
            if is_valid_captcha(text):
                label = [self.char_map[ch] for ch in text]
                batch_images[j] = img
                batch_labels[j] = label
                label_length[j] = len(text)
        
        batch_inputs = {
                'input_data': batch_images,
                'input_label': batch_labels,
                'input_length': input_length,
                'label_length': label_length,
                }
        return batch_inputs, np.zeros(batch_len).astype(np.float32)
        
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

# Batch size for training and validation
batch_size = 16

# Desired image dimensions
img_width=200
img_height=50 

# Factor  by which the image is going to be downsampled
# by the convolutional blocks
downsample_factor=4

# Maximum length of any captcha in the data
max_length=5

# Get a generator object for the training data
train_data_generator = DataGenerator(data=training_data,
                                     labels=training_labels,
                                     char_map=char_to_labels,
                                     batch_size=batch_size,
                                     img_width=img_width,
                                     img_height=img_height,
                                     downsample_factor=downsample_factor,
                                     max_length=max_length,
                                     shuffle=True
                                    )

# Get a generator object for the validation data 
valid_data_generator = DataGenerator(data=validation_data,
                                     labels=validation_labels,
                                     char_map=char_to_labels,
                                     batch_size=batch_size,
                                     img_width=img_width,
                                     img_height=img_height,
                                     downsample_factor=downsample_factor,
                                     max_length=max_length,
                                     shuffle=False
                                    )
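
As a sanity check on the generator output (a small sketch using the objects defined above): each batch holds transposed images of shape (batch, 200, 50, 1), integer-encoded labels of length 5, and a CTC input_length of 200 // 4 - 2 = 48 time steps, matching the two 2x2 max-pooling layers in the model below.

# Peek at the first training batch to confirm what gets fed into the CTC loss.
sample_inputs, _ = train_data_generator[0]
print(sample_inputs['input_data'].shape)    # (16, 200, 50, 1)
print(sample_inputs['input_label'].shape)   # (16, 5)
print(sample_inputs['input_length'][0])     # [48] = img_width // downsample_factor - 2
print(sample_inputs['label_length'][0])     # [5]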

CTC loss function

class CTCLayer(layers.Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred, input_length, label_length):
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)
        
        # At test time, just return the computed loss
        return loss
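
A quick note on why CTC is used here: the network outputs one softmax distribution per time step (50 steps after downsampling), but a label only has 5 characters and we do not know which columns belong to which character. CTC handles this by adding an extra "blank" class at the last index (19 here, which is why the dense2 layer below has len(characters)+1 units and the decoder later keeps only indices < len(characters)) and summing over all possible alignments during training. At decoding time, adjacent repeated symbols are collapsed and blanks are removed. A minimal sketch of that collapsing rule, for illustration only (keras.backend.ctc_decode performs this for us later):

def greedy_collapse(best_path, blank_index):
    # Collapse a per-time-step argmax path: drop adjacent repeats, then drop blanks.
    decoded = []
    prev = None
    for idx in best_path:
        if idx != prev and idx != blank_index:
            decoded.append(idx)
        prev = idx
    return decoded

# e.g. with blank index 19: [7, 7, 19, 2, 2, 19, 9] -> [7, 2, 9] -> 'b4d'
print(''.join(labels_to_char[i] for i in greedy_collapse([7, 7, 19, 2, 2, 19, 9], 19)))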

Building the model

def build_model():
    # Inputs to the model
    input_img = layers.Input(shape=(img_width, img_height, 1),
                            name='input_data',
                            dtype='float32')
    labels = layers.Input(name='input_label', shape=[max_length], dtype='float32')
    input_length = layers.Input(name='input_length', shape=[1], dtype='int64')
    label_length = layers.Input(name='label_length', shape=[1], dtype='int64')
    
    # First conv block
    x = layers.Conv2D(32,
               (3,3),
               activation='relu',
               kernel_initializer='he_normal',
               padding='same',
               name='Conv1')(input_img)
    x = layers.MaxPooling2D((2,2), name='pool1')(x)
    
    # Second conv block
    x = layers.Conv2D(64,
               (3,3),
               activation='relu',
               kernel_initializer='he_normal',
               padding='same',
               name='Conv2')(x)
    x = layers.MaxPooling2D((2,2), name='pool2')(x)
    
    # We have used two max pool with pool size and strides of 2.
    # Hence, downsampled feature maps are 4x smaller. The number of
    # filters in the last layer is 64. Reshape accordingly before
    # passing it to RNNs
    new_shape = ((img_width // 4), (img_height // 4)*64)
    x = layers.Reshape(target_shape=new_shape, name='reshape')(x)
    x = layers.Dense(64, activation='relu', name='dense1')(x)
    x = layers.Dropout(0.2)(x)
    
    # RNNs
    x = layers.Bidirectional(layers.LSTM(128,
                                         return_sequences=True,
                                         dropout=0.2))(x)
    x = layers.Bidirectional(layers.LSTM(64,
                                         return_sequences=True,
                                         dropout=0.25))(x)
    
    # Predictions
    x = layers.Dense(len(characters)+1,
              activation='softmax', 
              name='dense2',
              kernel_initializer='he_normal')(x)
    
    # Calculate CTC
    output = CTCLayer(name='ctc_loss')(labels, x, input_length, label_length)
    
    # Define the model
    model = keras.models.Model(inputs=[input_img,
                                       labels,
                                       input_length,
                                       label_length],
                                outputs=output,
                                name='ocr_model_v1')
    
    # Optimizer
    sgd = keras.optimizers.SGD(learning_rate=0.002,
                               decay=1e-6,
                               momentum=0.9,
                               nesterov=True,
                               clipnorm=5)
    
    # Compile the model and return 
    model.compile(optimizer=sgd,metrics=['accuracy'])
    return model

Model: "ocr_model_v1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_data (InputLayer)         [(None, 200, 50, 1)] 0                                            
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (None, 200, 50, 32)  320         input_data[0][0]                 
__________________________________________________________________________________________________
pool1 (MaxPooling2D)            (None, 100, 25, 32)  0           Conv1[0][0]                      
__________________________________________________________________________________________________
Conv2 (Conv2D)                  (None, 100, 25, 64)  18496       pool1[0][0]                      
__________________________________________________________________________________________________
pool2 (MaxPooling2D)            (None, 50, 12, 64)   0           Conv2[0][0]                      
__________________________________________________________________________________________________
reshape (Reshape)               (None, 50, 768)      0           pool2[0][0]                      
__________________________________________________________________________________________________
dense1 (Dense)                  (None, 50, 64)       49216       reshape[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 50, 64)       0           dense1[0][0]                     
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 50, 256)      197632      dropout[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 50, 128)      164352      bidirectional[0][0]              
__________________________________________________________________________________________________
input_label (InputLayer)        [(None, 5)]          0                                            
__________________________________________________________________________________________________
dense2 (Dense)                  (None, 50, 20)       2580        bidirectional_1[0][0]            
__________________________________________________________________________________________________
input_length (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
label_length (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
ctc_loss (CTCLayer)             (None, 1)            0           input_label[0][0]                
                                                                 dense2[0][0]                     
                                                                 input_length[0][0]               
                                                                 label_length[0][0]               
==================================================================================================
Total params: 432,596
Trainable params: 432,596
Non-trainable params: 0
__________________________________________________________________________________________________

model = build_model()
model.summary()

Loading model weights

If a previously trained weights file exists, load it.

checkpoint_save_path = os.path.join(BASE_DIR, 'checkpoint', 'BaseCNN.ckpt')
if os.path.exists(checkpoint_save_path + '.index'):
    print('-------------load the model-----------------')
    model.load_weights(checkpoint_save_path)
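
Note that the training code below only uses EarlyStopping, so nothing actually writes this checkpoint. To produce it, you could add a ModelCheckpoint callback pointing at the same path; a minimal sketch (the checkpoint path is the one defined above, and the monitored metric is an assumption):

cp = keras.callbacks.ModelCheckpoint(filepath=checkpoint_save_path,
                                     save_weights_only=True,
                                     monitor='val_loss',
                                     save_best_only=True)
# then pass it alongside early stopping: model.fit(..., callbacks=[es, cp])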

Model training

# Add early stopping
es = keras.callbacks.EarlyStopping(monitor='val_loss',
                                   patience=5,
                                   restore_best_weights=True)

# Train the model
history = model.fit(train_data_generator,
                    validation_data=valid_data_generator,
                    epochs=50,
                    callbacks=[es])

Epoch 1/50
59/59 [==============================] - 13s 220ms/step - loss: 24.1759 - accuracy: 0.0000e+00 - val_loss: 16.4636 - val_accuracy: 0.0000e+00
Epoch 2/50
59/59 [==============================] - 10s 171ms/step - loss: 16.4264 - accuracy: 0.0000e+00 - val_loss: 16.3060 - val_accuracy: 0.0000e+00
Epoch 3/50
59/59 [==============================] - 10s 170ms/step - loss: 16.3786 - accuracy: 0.0000e+00 - val_loss: 16.3366 - val_accuracy: 0.0000e+00
Epoch 4/50
59/59 [==============================] - 10s 174ms/step - loss: 16.2760 - accuracy: 0.0000e+00 - val_loss: 16.0451 - val_accuracy: 0.0000e+00
Epoch 5/50
59/59 [==============================] - 10s 171ms/step - loss: 16.0664 - accuracy: 0.0000e+00 - val_loss: 15.9370 - val_accuracy: 0.0000e+00
...
Epoch 45/50
59/59 [==============================] - 7s 122ms/step - loss: 0.1582 - accuracy: 0.9380 - val_loss: 0.0377 - val_accuracy: 0.9808
Epoch 46/50
59/59 [==============================] - 7s 123ms/step - loss: 0.1585 - accuracy: 0.9423 - val_loss: 0.0396 - val_accuracy: 0.9712
Epoch 47/50
59/59 [==============================] - 7s 121ms/step - loss: 0.1485 - accuracy: 0.9476 - val_loss: 0.0948 - val_accuracy: 0.9615
Epoch 48/50
59/59 [==============================] - 7s 121ms/step - loss: 0.1382 - accuracy: 0.9626 - val_loss: 0.0284 - val_accuracy: 0.9808
Epoch 49/50
59/59 [==============================] - 7s 121ms/step - loss: 0.0908 - accuracy: 0.9690 - val_loss: 0.0470 - val_accuracy: 0.9808
Epoch 50/50
59/59 [==============================] - 7s 122ms/step - loss: 0.0879 - accuracy: 0.9679 - val_loss: 0.2272 - val_accuracy: 0.9519

Model validation

prediction_model = keras.models.Model(model.get_layer(name='input_data').input,
                                        model.get_layer(name='dense2').output)
prediction_model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_data (InputLayer)      [(None, 200, 50, 1)]      0         
_________________________________________________________________
Conv1 (Conv2D)               (None, 200, 50, 32)       320       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 100, 25, 32)       0         
_________________________________________________________________
Conv2 (Conv2D)               (None, 100, 25, 64)       18496     
_________________________________________________________________
pool2 (MaxPooling2D)         (None, 50, 12, 64)        0         
_________________________________________________________________
reshape (Reshape)            (None, 50, 768)           0         
_________________________________________________________________
dense1 (Dense)               (None, 50, 64)            49216     
_________________________________________________________________
dropout (Dropout)            (None, 50, 64)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 256)           197632    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 128)           164352    
_________________________________________________________________
dense2 (Dense)               (None, 50, 20)            2580      
=================================================================
Total params: 432,596
Trainable params: 432,596
Non-trainable params: 0
_________________________________________________________________

# A utility to decode the output of the network
def decode_batch_predictions(pred):
    pred = pred[:, :-2]
    input_len = np.ones(pred.shape[0])*pred.shape[1]
    
    # Use greedy search. For complex tasks, you can use beam search
    results = keras.backend.ctc_decode(pred, 
                                        input_length=input_len,
                                        greedy=True)[0][0]
    
    # Iterate over the results and get back the text
    output_text = []
    for res in results.numpy():
        outstr = ''
        for c in res:
            if c < len(characters) and c >=0:
                outstr += labels_to_char[c]
        output_text.append(outstr)
    
    # return final text results
    return output_text
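
As the comment above notes, greedy search is usually sufficient for 5-character captchas; for harder tasks, keras.backend.ctc_decode also supports beam search. A sketch of the alternative call (beam_width=10 is an arbitrary choice here):

results = keras.backend.ctc_decode(pred,
                                   input_length=input_len,
                                   greedy=False,
                                   beam_width=10,
                                   top_paths=1)[0][0]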

Prediction

#  Let's check results on some validation samples
for p, (inp_value, _) in enumerate(valid_data_generator):
    bs = inp_value['input_data'].shape[0]
    X_data = inp_value['input_data']
    labels = inp_value['input_label']
    
    preds = prediction_model.predict(X_data)
    pred_texts = decode_batch_predictions(preds)    
    
    orig_texts = []
    for label in labels:
        text = ''.join([labels_to_char[int(x)] for x in label])
        orig_texts.append(text)
        
    for i in range(bs):
        print(f'Actual: {orig_texts[i]} \t Predicted: {pred_texts[i]} \t Correct: {orig_texts[i] == pred_texts[i]}')
    break

Actual: dw8d3    Predicted: dw8d3    Correct: True
Actual: f85y3    Predicted: f85y3    Correct: True
Actual: b4d7c    Predicted: b4d7c    Correct: True
Actual: 7dyww    Predicted: 7dyww    Correct: True
Actual: n7enn    Predicted: n7enn    Correct: True
Actual: m8m4x    Predicted: n8m4x    Correct: False
Actual: xe8xm    Predicted: xe8xm    Correct: True
Actual: c3n8x    Predicted: c3n8x    Correct: True
Actual: 72m6f    Predicted: 72m6f    Correct: True
Actual: 2mpnn    Predicted: 2mpnn    Correct: True
Actual: w8f36    Predicted: w8f36    Correct: True
Actual: dbny3    Predicted: dbny3    Correct: True
Actual: bgd4m    Predicted: bgd4m    Correct: True
Actual: yemy4    Predicted: yemy4    Correct: True
Actual: bmxpe    Predicted: bmxpe    Correct: True
Actual: nxc83    Predicted: nxc83    Correct: True
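
To get an aggregate number instead of eyeballing a single batch, here is a small sketch that runs the same decoding over the entire validation generator and reports exact-match (all five characters correct) accuracy; it reuses prediction_model, decode_batch_predictions, and labels_to_char from above:

correct = total = 0
for batch_idx in range(len(valid_data_generator)):
    inp_value, _ = valid_data_generator[batch_idx]
    preds = prediction_model.predict(inp_value['input_data'])
    pred_texts = decode_batch_predictions(preds)
    for label, pred_text in zip(inp_value['input_label'], pred_texts):
        orig_text = ''.join(labels_to_char[int(x)] for x in label)
        correct += int(orig_text == pred_text)
        total += 1
print('Exact-match accuracy: {:.4f}'.format(correct / total))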

Learning curves

# Show acc and Loss curves for training and validation sets
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

plt.subplot(1, 2, 1)
plt.plot(acc, label='Training Accuracy')
plt.plot(val_acc, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

