Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

使用 lightened_cnn.py 训练 CASIA-WebFace,accuracy 一直都小于 0.1,是为什么呢? #31

Open
adeagle opened this issue Nov 25, 2016 · 2 comments

Comments

@adeagle
Copy link

adeagle commented Nov 25, 2016

No description provided.

@tornadomeet
Copy link
Owner

I don't know, because you have not given any details about your experiment.

@adeagle
Copy link
Author

adeagle commented Nov 27, 2016

一、数据集CASIA-webface

二、数据清洗方法:
https://github.com/happynear/FaceVerification
Good News: @潘泳苹果皮 and his colleagues have washed the CASIA-webface database manually. After washing, 27703 wrong images are deleted. The washed list can be downloaded from http://pan.baidu.com/s/1hrKpbm8

三、制作数据集
# Data-preparation script: align faces, build .lst files, pack .rec files.
raw_data_path=/opt/mxnet/mxnet-face-master/verification/FullData
align_data_path=/opt/mxnet/mxnet-face-master/verification/FullData-align/
makelist_path=/opt/mxnet/mxnet/tools/make_list.py

# Number of processes used to align the data; change this to suit your
# environment.
num_process=4
landmarks=innerEyesAndBottomLip
face_size=144
ts=0.1
list_name=casia
rec_name=casia
im2rec_path=/opt/mxnet/mxnet/bin/im2rec

# step1: align the face images.
# Skipped entirely when the output directory already exists.
if ! [ -e "$align_data_path" ]; then
    mkdir -p "$align_data_path"
    for N in $(seq "$num_process"); do
        echo "the sub-process is : $N"
        # Each aligner runs in the background; the `wait` below collects them.
        python ../util/align_face.py "$raw_data_path" align "$landmarks" "$align_data_path" --ts "$ts" --size "$face_size" &
    done
else
    echo "$align_data_path already exist."
fi
wait
echo "Align face image done"

# step2: generate the .lst files for im2rec.
if ! [ -e "${list_name}image_train.lst" ]; then
    python -u "$makelist_path" "$align_data_path" "$list_name" --train_ratio 0.95 --recursive True
else
    echo ".lst file for training already exist."
fi
echo "generated .lst file done"

# step3: use im2rec to generate the .rec files for training.
# The train and val records are packed in parallel; `wait` blocks until both
# background jobs finish.
if ! [ -e "${rec_name}_train.rec" ]; then
    "$im2rec_path" "${list_name}image_train.lst" "$align_data_path" "${rec_name}_train.rec" color=0 encoding='.png' &
    "$im2rec_path" "${list_name}image_val.lst" "$align_data_path" "${rec_name}_val.rec" color=0 encoding='.png' &
else
    echo "$rec_name already exist."
fi
wait
echo "generate .rec done"
echo "trining done!"

四、数据集类数和训练样本数
ls FullData-align/ | wc
10575
cat casiaimage_train.lst | wc
413914

五、训练源码
lightened_cnn.py:

import argparse,logging
import mxnet as mx
import matplotlib.pyplot as plt
import numpy as np
import time
import threading
import Queue

# Set the root logger's level to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Context object for GPU 0; not referenced by the other code visible in this file.
ctx = mx.gpu(0)
# Bounded queue (maxsize 1000) read by the plotting loop at the bottom of the
# file; the only visible producer call (msg.put in MySpeedometer) is commented out.
msg=Queue.Queue(maxsize = 1000)

def group(data, num_r, num, kernel, stride, pad, layer):
    """Build one convolution -> max-feature-map -> max-pool stage.

    When ``num_r`` is positive, a 1x1 convolution with ``num_r`` filters is
    applied first; its output is split in two with ``SliceChannel`` and the
    element-wise maximum of the two parts feeds the main convolution.
    Otherwise the main convolution reads ``data`` directly.

    Args:
        data: input symbol.
        num_r: filter count of the optional 1x1 convolution (``<= 0`` skips it).
        num: filter count of the main convolution.
        kernel: kernel size of the main convolution.
        stride: stride of the main convolution.
        pad: padding of the main convolution.
        layer: suffix inserted into the operator names (``conv%s``, ...).

    Returns:
        The symbol of the final 2x2, stride-2 max pooling.
    """
    conv_input = data
    if num_r > 0:
        reduce_conv = mx.symbol.Convolution(
            data=data, num_filter=num_r, kernel=(1, 1), name='conv%s_r' % layer)
        reduce_parts = mx.symbol.SliceChannel(
            data=reduce_conv, num_outputs=2, name='slice%s_r' % layer)
        conv_input = mx.symbol.maximum(reduce_parts[0], reduce_parts[1])

    conv = mx.symbol.Convolution(
        data=conv_input, kernel=kernel, stride=stride, pad=pad,
        num_filter=num, name='conv%s' % layer)
    parts = mx.symbol.SliceChannel(data=conv, num_outputs=2, name='slice%s' % layer)
    max_feature_map = mx.symbol.maximum(parts[0], parts[1])
    return mx.symbol.Pooling(
        data=max_feature_map, pool_type="max", kernel=(2, 2), stride=(2, 2),
        name='pool%s' % layer)

def lightened_cnn_a_feature():
    """Build the feature extractor of lightened CNN variant A.

    Four ``group`` stages (none with the optional 1x1 convolution, all with
    stride 1 and no padding) are followed by a 512-unit fully connected
    layer, a two-way ``SliceChannel`` + element-wise maximum, and dropout
    with p=0.7.

    Returns:
        The ``drop1`` dropout symbol.
    """
    # (filter count, kernel) for stages 1..4, in order.
    stage_specs = (
        (96, (9, 9)),
        (192, (5, 5)),
        (256, (5, 5)),
        (384, (4, 4)),
    )
    net = mx.symbol.Variable(name="data")
    for index, (filters, kernel) in enumerate(stage_specs, 1):
        net = group(net, 0, filters, kernel, (1, 1), (0, 0), str(index))

    flat = mx.symbol.Flatten(data=net)
    fc1 = mx.symbol.FullyConnected(data=flat, num_hidden=512, name="fc1")
    fc1_parts = mx.symbol.SliceChannel(data=fc1, num_outputs=2, name="slice_fc1")
    fc1_max = mx.symbol.maximum(fc1_parts[0], fc1_parts[1])
    return mx.symbol.Dropout(data=fc1_max, p=0.7, name="drop1")

def lightened_cnn_a(num_classes=10575):
    """Build lightened CNN variant A with a softmax classification head.

    Args:
        num_classes: number of output units of the final ``fc2`` layer.

    Returns:
        The ``SoftmaxOutput`` symbol named ``softmax``.
    """
    features = lightened_cnn_a_feature()
    logits = mx.symbol.FullyConnected(
        data=features, num_hidden=num_classes, name="fc2")
    return mx.symbol.SoftmaxOutput(data=logits, name='softmax')

def lightened_cnn_b_feature():
    """Build the feature extractor of lightened CNN variant B.

    Five ``group`` stages (stages 2-5 use the optional 1x1 convolution) are
    followed by a 512-unit fully connected layer, a two-way
    ``SliceChannel`` + element-wise maximum, and dropout with p=0.7.

    Returns:
        The ``drop1`` dropout symbol.
    """
    # (1x1 filter count, main filter count, kernel, pad) for stages 1..5.
    stage_specs = (
        (0, 96, (5, 5), (2, 2)),
        (96, 192, (3, 3), (1, 1)),
        (192, 384, (3, 3), (1, 1)),
        (384, 256, (3, 3), (1, 1)),
        (256, 256, (3, 3), (1, 1)),
    )
    net = mx.symbol.Variable(name="data")
    for index, (reduce_filters, filters, kernel, pad) in enumerate(stage_specs, 1):
        net = group(net, reduce_filters, filters, kernel, (1, 1), pad, str(index))

    flat = mx.symbol.Flatten(data=net)
    fc1 = mx.symbol.FullyConnected(data=flat, num_hidden=512, name="fc1")
    fc1_parts = mx.symbol.SliceChannel(data=fc1, num_outputs=2, name="slice_fc1")
    fc1_max = mx.symbol.maximum(fc1_parts[0], fc1_parts[1])
    return mx.symbol.Dropout(data=fc1_max, p=0.7, name="drop1")

def lightened_cnn_b(num_classes=10575):
    """Build lightened CNN variant B with a softmax classification head.

    Args:
        num_classes: number of output units of the final ``fc2`` layer.

    Returns:
        The ``SoftmaxOutput`` symbol named ``softmax``.
    """
    features = lightened_cnn_b_feature()
    logits = mx.symbol.FullyConnected(
        data=features, num_hidden=num_classes, name="fc2")
    return mx.symbol.SoftmaxOutput(data=logits, name='softmax')

class MySpeedometer(object):
    """Batch-end callback that logs throughput and metric values.

    Every ``frequent`` batches it logs the samples/sec measured since the
    previous report and, when the callback parameter carries an evaluation
    metric, each of its name/value pairs. The metric is reset after it is
    read.

    Args:
        batch_size: number of samples per batch, used to compute the speed.
        frequent: report every this many batches.
    """

    def __init__(self, batch_size, frequent=50):
        self.batch_size = batch_size
        self.frequent = frequent
        self.init = False      # False until the timer has been started
        self.tic = 0           # time.time() of the last report / timer start
        self.last_count = 0    # batch index seen on the previous call
        self.data = []         # not used by the methods of this class
        self.epoch = 0         # last epoch for which an accuracy metric was seen

    def __call__(self, param):
        """Callback to show speed.

        Args:
            param: object exposing ``nbatch``, ``epoch`` and ``eval_metric``
                (``eval_metric`` may be ``None``).
        """
        count = param.nbatch
        # The batch counter going backwards means a new pass has started:
        # restart the timer instead of reporting across the boundary.
        if self.last_count > count:
            self.init = False
        self.last_count = count

        if not self.init:
            self.init = True
            self.tic = time.time()
            return
        if count % self.frequent != 0:
            return

        elapsed = time.time() - self.tic
        samples = self.frequent * self.batch_size
        # A coarse clock can report zero elapsed time between two calls;
        # report an infinite speed instead of raising ZeroDivisionError.
        speed = samples / elapsed if elapsed > 0 else float("inf")

        metric = param.eval_metric
        if metric is None:
            logging.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec",
                         param.epoch, count, speed)
        else:
            name_value = metric.get_name_value()
            metric.reset()
            for name, value in name_value:
                # Remember the epoch of the latest accuracy report. Pushing
                # the value to the plotting queue is disabled: msg.put(value)
                if "accuracy" in name and self.epoch != param.epoch:
                    self.epoch = param.epoch
                logging.info('Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f',
                             param.epoch, count, speed, name, value)
        self.tic = time.time()

def main():
    """Train lightened CNN variant B on the CASIA record files.

    Configuration is read from the module-level ``args`` namespace (gpus,
    data_dir, batch_size, num_examples, lr, kv_store, retrain and the model
    load/save prefixes). A checkpoint is written at the end of every epoch.
    When ``args.retrain`` is set, parameters are loaded from an existing
    checkpoint and no validation iterator is used.
    """
    # Swap in lightened_cnn_a() here to train variant A instead.
    net = lightened_cnn_b()

    if args.gpus is None:
        devs = mx.cpu()
    else:
        devs = [mx.gpu(int(gpu_id)) for gpu_id in args.gpus.split(',')]

    batches_per_epoch = args.num_examples / args.batch_size
    checkpoint = mx.callback.do_checkpoint(args.model_save_prefix)
    kv = mx.kvstore.create(args.kv_store)

    if args.retrain:
        _, arg_params, aux_params = mx.model.load_checkpoint(
            args.model_load_prefix, args.model_load_epoch)
    else:
        arg_params, aux_params = None, None

    # Settings shared by the training and validation iterators. Note that
    # the validation iterator also uses rand_crop=True, as in the original.
    iter_common = dict(
        data_shape=(1, 128, 128),
        scale=1. / 255,
        batch_size=args.batch_size,
        rand_crop=True,
        num_parts=kv.num_workers,
        part_index=kv.rank)

    train = mx.io.ImageRecordIter(
        path_imgrec=args.data_dir + "casia_train.rec",
        rand_mirror=True,
        **iter_common)
    if args.retrain:
        val = None
    else:
        val = mx.io.ImageRecordIter(
            path_imgrec=args.data_dir + "casia_val.rec",
            rand_mirror=False,
            **iter_common)

    # Multiply the learning rate by 0.8 every 5 epochs' worth of batches,
    # with stop_factor_lr=5e-5.
    scheduler = mx.lr_scheduler.FactorScheduler(
        step=5 * max(int(batches_per_epoch * 1), 1),
        factor=0.8,
        stop_factor_lr=5e-5)
    model = mx.model.FeedForward(
        ctx=devs,
        symbol=net,
        arg_params=arg_params,
        aux_params=aux_params,
        num_epoch=200,
        learning_rate=args.lr,
        momentum=0.9,
        wd=0.0005,
        lr_scheduler=scheduler,
        initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))
    model.fit(
        X=train,
        eval_data=val,
        kvstore=kv,
        batch_end_callback=MySpeedometer(args.batch_size, 100),
        epoch_end_callback=checkpoint)

# Script entry point: parse the command line, train in a worker thread and
# plot accuracy values received through the ``msg`` queue.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="command for training lightened-cnn")
    parser.add_argument('--gpus', type=str, help='the gpus will be used, e.g "0,1,2,3"')
    parser.add_argument('--data-dir', type=str, default='./', help='the input data directory')
    parser.add_argument('--model-save-prefix', type=str, default='../model/lightened_cnn/lightened_cnn',
                        help='the prefix of the model to save')
    parser.add_argument('--lr', type=float, default=0.05, help='initialization learning reate')
    parser.add_argument('--batch-size', type=int, default=100, help='the batch size')
    parser.add_argument('--num-examples', type=int, default=413914, help='the number of training examples')
    parser.add_argument('--kv-store', type=str, default='local', help='the kvstore type')
    # NOTE(review): this default differs from the --model-save-prefix default;
    # confirm which prefix --retrain is expected to load from.
    parser.add_argument('--model-load-prefix', type=str, default='../model/lightened_cnn', help='the prefix of the model to load')
    parser.add_argument('--model-load-epoch', type=int, default=1, help='load the model on an epoch using the model-load-prefix')
    parser.add_argument('--retrain', action='store_true', default=False, help='true means continue training')
    args = parser.parse_args()
    logging.info(args)

    # Training runs in a worker thread; the main thread polls the queue and
    # draws (presumably because the matplotlib GUI must stay on the main
    # thread -- confirm for the backend in use).
    t1 = threading.Thread(target=main)
    t1.start()

    plt.figure(figsize=(8, 6))
    data = []
    # Keep polling while training is running or values are still queued.
    # The original joined the thread first and then looped forever, so the
    # process never exited once training had finished.
    while t1.is_alive() or not msg.empty():
        if msg.empty():
            time.sleep(1)
            continue
        data.append(msg.get())
        plt.xlabel("Epoch")
        plt.ylabel("Accuracy")
        plt.plot(range(len(data)), data, 'o', linestyle='-', color="r", label="Train accuracy")
        plt.ylim([0, 1])
        plt.draw()
        plt.waitforbuttonpress(timeout=0.001)
    t1.join()

六、现象及问题:
1、我试过改lr 从0.008 - 0.1 如果是0.1 Train-Accuracy 会收敛在0.0001左右
如果在0.008-0.05 之间,会收敛在0.75左右
2、我的问题是,这个数据集用 lightened cnn 能否收敛到90%以上呢?

@adeagle adeagle closed this as completed Nov 27, 2016
@adeagle adeagle reopened this Nov 27, 2016
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants