qqwweee · bluesy7585 · Mar 7, 2019 · Mar 22, 2019 · Mar 22, 2019 · Mar 22, 2019
diff --git a/convert_voc_annotation.py b/convert_voc_annotation.py
@@ -0,0 +1,66 @@
+import os
+import argparse
+import xml.etree.ElementTree as ET
+
+# Class names; a name's index in this list is the class id written per box (use ["bus"] alone for one class).
+classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
+
+def scan_xml_files(dir):
+    """Return the paths of entries in `dir` whose name ends in .xml or .XML."""
+    return [
+        os.path.join(dir, entry) for entry in os.listdir(dir)
+        if entry.endswith((".xml", ".XML"))
+    ]
+
+def convert_annotation(file):
+    """Parse one VOC XML annotation into ' xmin,ymin,xmax,ymax,cls_id' strings.
+
+    Objects whose class is not in `classes`, or that are flagged difficult, are
+    skipped. Returns a list with one string per kept object (possibly empty).
+    """
+    # ET.parse accepts a path and closes the file itself, so no handle is left open.
+    root = ET.parse(file).getroot()
+    write_lines = []
+    for obj in root.iter('object'):
+        cls = obj.find('name').text
+        # A missing <difficult> tag is treated as "not difficult".
+        difficult = obj.findtext('difficult', default='0')
+        if cls not in classes or int(difficult) == 1:
+            continue
+        xmlbox = obj.find('bndbox')
+        b = [int(xmlbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
+        write_lines.append(" " + ",".join(str(a) for a in b + [classes.index(cls)]))
+    return write_lines
+
+
+def create_train_txt(input, imgdir):
+    """Write train.txt with one 'image_path box box ...' line per annotated image.
+
+    input:  directory holding the VOC XML annotation files.
+    imgdir: directory prefix used to build each image path; the image is assumed
+            to share the annotation's base name with a .jpg extension.
+    Annotation files that yield no usable object are left out of train.txt.
+    """
+    lines_to_write = []
+    for f in scan_xml_files(input):
+        write_lines = convert_annotation(f)
+        if not write_lines:
+            continue
+
+        # Swap only the extension: str.replace('xml', 'jpg') also rewrote 'xml'
+        # inside the base name and left upper-case '.XML' untouched.
+        stem = os.path.splitext(os.path.basename(f))[0]
+        fullpath = os.path.join(imgdir, stem + '.jpg')
+        lines_to_write.append(fullpath + ''.join(write_lines))
+
+    with open('train.txt', 'w+') as the_file:
+        the_file.write('\n'.join(lines_to_write))
+
+
+if __name__ == '__main__':  # keep a plain import of this module free of CLI side effects
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-t", dest="input_dir", type=str, default='./image/', help="directory of annotations")
+    parser.add_argument("-i", dest="image_dir", type=str, required=True,
+                        help="directory of image files")
+    args = parser.parse_args()
+    create_train_txt(args.input_dir, args.image_dir)
diff --git a/yolo.py b/yolo.py
@@ -99,7 +99,7 @@ def generate(self):
                 score_threshold=self.score, iou_threshold=self.iou)
         return boxes, scores, classes
 
-    def detect_image(self, image):
+    def detect_image(self, image, single_image=True, output=list()):
         start = timer()
 
         if self.model_image_size != (None, None):
@@ -112,7 +112,7 @@ def detect_image(self, image):
             boxed_image = letterbox_image(image, new_image_size)
         image_data = np.array(boxed_image, dtype='float32')
 
-        print(image_data.shape)
+        if single_image:print(image_data.shape)
         image_data /= 255.
         image_data = np.expand_dims(image_data, 0)  # Add batch dimension.
 
@@ -124,7 +124,7 @@ def detect_image(self, image):
                 K.learning_phase(): 0
             })
 
-        print('Found {} boxes for {}'.format(len(out_boxes), 'img'))
+        if single_image:print('Found {} boxes for {}'.format(len(out_boxes), 'img'))
 
         font = ImageFont.truetype(font='font/FiraMono-Medium.otf',
                     size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
@@ -144,7 +144,11 @@ def detect_image(self, image):
             left = max(0, np.floor(left + 0.5).astype('int32'))
             bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
             right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
-            print(label, (left, top), (right, bottom))
+            if single_image:
+                print(label, (left, top), (right, bottom))
+            else:
+                output_str='{} {} {} {} {}'.format(label, left, top, right, bottom)
+                output.append(output_str)
 
             if top - label_size[1] >= 0:
                 text_origin = np.array([left, top - label_size[1]])
@@ -163,7 +167,7 @@ def detect_image(self, image):
             del draw
 
         end = timer()
-        print(end - start)
+        if single_image:print(end - start)
         return image
 
     def close_session(self):

diff --git a/yolo3/model.py b/yolo3/model.py
@@ -193,7 +193,7 @@ def yolo_eval(yolo_outputs,
               iou_threshold=.5):
     """Evaluate YOLO model on given input and return filtered boxes."""
     num_layers = len(yolo_outputs)
-    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] # default setting
+    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [0,1,2]] # default setting
     input_shape = K.shape(yolo_outputs[0])[1:3] * 32
     boxes = []
     box_scores = []
@@ -247,7 +247,7 @@ def preprocess_true_boxes(true_boxes, input_shape, anchors, num_classes):
     '''
     assert (true_boxes[..., 4]<num_classes).all(), 'class id must be less than num_classes'
     num_layers = len(anchors)//3 # default setting
-    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]]
+    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [0,1,2]]
 
     true_boxes = np.array(true_boxes, dtype='float32')
     input_shape = np.array(input_shape, dtype='int32')
@@ -361,7 +361,7 @@ def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
     num_layers = len(anchors)//3 # default setting
     yolo_outputs = args[:num_layers]
     y_true = args[num_layers:]
-    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]]
+    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [0,1,2]]
     input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
     grid_shapes = [K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0])) for l in range(num_layers)]
     loss = 0

diff --git a/yolo3/utils.py b/yolo3/utils.py
@@ -4,11 +4,10 @@
 
 from PIL import Image
 import numpy as np
-from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
+import cv2
 
 def compose(*funcs):
     """Compose arbitrarily many functions, evaluated left to right.
-
     Reference: https://mathieularose.com/function-composition-in-python/
     """
     # return lambda x: reduce(lambda v, f: f(v), funcs, x)
@@ -36,8 +35,11 @@ def rand(a=0, b=1):
 def get_random_data(annotation_line, input_shape, random=True, max_boxes=20, jitter=.3, hue=.1, sat=1.5, val=1.5, proc_img=True):
     '''random preprocessing for real-time data augmentation'''
     line = annotation_line.split()
-    image = Image.open(line[0])
-    iw, ih = image.size
+
+    # numpy array: BGR, 0-255
+    image = cv2.imread(line[0])
+    # height, width, channel
+    ih, iw, _ = image.shape
     h, w = input_shape
     box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
 
@@ -50,9 +52,13 @@ def get_random_data(annotation_line, input_shape, random=True, max_boxes=20, jit
         dy = (h-nh)//2
         image_data=0
         if proc_img:
-            image = image.resize((nw,nh), Image.BICUBIC)
+            # resize
+            image = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_AREA)
+            # convert into PIL Image object
+            image = Image.fromarray(image[:, :, ::-1])
             new_image = Image.new('RGB', (w,h), (128,128,128))
             new_image.paste(image, (dx, dy))
+            # convert into numpy array: RGB, 0-1
             image_data = np.array(new_image)/255.
 
         # correct boxes
@@ -75,40 +81,84 @@ def get_random_data(annotation_line, input_shape, random=True, max_boxes=20, jit
     else:
         nw = int(scale*w)
         nh = int(nw/new_ar)
-    image = image.resize((nw,nh), Image.BICUBIC)
+
+    # resize
+    image = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_AREA)
+    # convert into PIL Image object
+    image = Image.fromarray(image[:, :, ::-1])
 
     # place image
     dx = int(rand(0, w-nw))
     dy = int(rand(0, h-nh))
     new_image = Image.new('RGB', (w,h), (128,128,128))
     new_image.paste(image, (dx, dy))
-    image = new_image
-
-    # flip image or not
-    flip = rand()<.5
-    if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
+    # convert into numpy array: BGR, 0-255
+    image = np.asarray(new_image)[:, :, ::-1]
+
+    # horizontal flip (faster than cv2.flip())
+    h_flip = rand() < 0.5
+    if h_flip:
+        image = image[:, ::-1]
+
+    # vertical flip
+    v_flip = False#rand() < 0.5
+    if v_flip:
+        image = image[::-1]
+
+    # rotation augment
+    is_rot = False
+    if is_rot:
+        right = rand() < 0.5
+        if right:
+            image = image.transpose(1, 0, 2)[:, ::-1]
+        else:
+            image = image.transpose(1, 0, 2)[::-1]
 
     # distort image
-    hue = rand(-hue, hue)
+    img_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+    H = img_hsv[:, :, 0].astype(np.float32)
+    S = img_hsv[:, :, 1].astype(np.float32)
+    V = img_hsv[:, :, 2].astype(np.float32)
+
+    hue = rand(-hue, hue) * 179
+    H += hue
+    np.clip(H, a_min=0, a_max=179, out=H)
+
     sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat)
+    S *= sat
+    np.clip(S, a_min=0, a_max=255, out=S)
+
     val = rand(1, val) if rand()<.5 else 1/rand(1, val)
-    x = rgb_to_hsv(np.array(image)/255.)
-    x[..., 0] += hue
-    x[..., 0][x[..., 0]>1] -= 1
-    x[..., 0][x[..., 0]<0] += 1
-    x[..., 1] *= sat
-    x[..., 2] *= val
-    x[x>1] = 1
-    x[x<0] = 0
-    image_data = hsv_to_rgb(x) # numpy array, 0 to 1
+    V *= val
+    np.clip(V, a_min=0, a_max=255, out=V)
+
+    img_hsv[:, :, 0] = H.astype(np.uint8)
+    img_hsv[:, :, 1] = S.astype(np.uint8)
+    img_hsv[:, :, 2] = V.astype(np.uint8)
+
+    # convert into numpy array: RGB, 0-1
+    image_data = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB) / 255.0
 
     # correct boxes
     box_data = np.zeros((max_boxes,5))
     if len(box)>0:
         np.random.shuffle(box)
         box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
         box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
-        if flip: box[:, [0,2]] = w - box[:, [2,0]]
+        if h_flip:
+            box[:, [0,2]] = w - box[:, [2,0]]
+        if v_flip:
+            box[:, [1,3]] = h - box[:, [3,1]]
+        if is_rot:
+            if right:
+                tmp = box[:, [0, 2]]
+                box[:, [0,2]] = h - box[:, [3,1]]
+                box[:, [1,3]] = tmp
+            else:
+                tmp = box[:, [2, 0]]
+                box[:, [0,2]] = box[:, [1,3]]
+                box[:, [1,3]] = w - tmp
+
         box[:, 0:2][box[:, 0:2]<0] = 0
         box[:, 2][box[:, 2]>w] = w
         box[:, 3][box[:, 3]>h] = h
@@ -118,4 +168,4 @@ def get_random_data(annotation_line, input_shape, random=True, max_boxes=20, jit
         if len(box)>max_boxes: box = box[:max_boxes]
         box_data[:len(box)] = box
 
-    return image_data, box_data
+    return image_data, box_data
diff --git a/yolo_video.py b/yolo_video.py
@@ -1,5 +1,6 @@
 import sys
 import argparse
+import os
 from yolo import YOLO, detect_video
 from PIL import Image
 
@@ -16,6 +17,42 @@ def detect_img(yolo):
             r_image.show()
     yolo.close_session()
 
+def get_image_files(dir):
+    """List paths of the .png/.jpg files (any letter case) directly in `dir`."""
+    return [
+        os.path.join(dir, entry)
+        for entry in os.listdir(dir)
+        if entry.lower().endswith((".png", ".jpg"))
+    ]
+
+def detect_imgdir(yolo, dir, output_txt=False):
+    """Run detection on each .png/.jpg in `dir` and write results to `dir`/out.
+
+    Writes <stem>.txt (one detection string per line) when output_txt is true,
+    otherwise saves the returned image as JPEG."""
+    img_files = get_image_files(dir)
+    save_dir = os.path.join(dir, 'out')
+    if not os.path.exists(save_dir): os.makedirs(save_dir)
+    try:
+        for img in img_files:
+            try:
+                image = Image.open(img)
+            except Exception:  # skip unreadable files, but let Ctrl-C / SystemExit through
+                print('Open Error! {}'.format(img))
+                continue
+            detections = []
+            r_image = yolo.detect_image(image, single_image=False, output=detections)
+            if not output_txt:
+                fullpath = os.path.join(save_dir, os.path.basename(img))
+                r_image.save(fullpath, "JPEG")
+                print('save {}'.format(fullpath))
+                continue
+            txt_file = os.path.splitext(os.path.basename(img))[0] + '.txt'
+            with open(os.path.join(save_dir, txt_file), 'w+') as the_file:
+                the_file.write('\n'.join(detections))
+    finally:  # release the session even if detection or saving raises
+        yolo.close_session()
+
 FLAGS = None
 
 if __name__ == '__main__':
@@ -25,17 +62,17 @@ def detect_img(yolo):
     Command line options
     '''
     parser.add_argument(
-        '--model', type=str,
+        '--model_path', type=str,
         help='path to model weight file, default ' + YOLO.get_defaults("model_path")
     )
 
     parser.add_argument(
-        '--anchors', type=str,
+        '--anchors_path', type=str,
         help='path to anchor definitions, default ' + YOLO.get_defaults("anchors_path")
     )
 
     parser.add_argument(
-        '--classes', type=str,
+        '--classes_path', type=str,
         help='path to class definitions, default ' + YOLO.get_defaults("classes_path")
     )
 
@@ -48,6 +85,16 @@ def detect_img(yolo):
         '--image', default=False, action="store_true",
         help='Image detection mode, will ignore all positional arguments'
     )
+
+    parser.add_argument(
+        '--imgdir', type=str, default='',
+        help='Image dir detection mode, will ignore all positional arguments'
+    )
+
+    parser.add_argument(
+        '--txt', default=False, action="store_true",
+        help='Image dir detection will output txt files'
+    )
     '''
     Command line positional arguments -- for video detection mode
     '''
@@ -71,6 +118,12 @@ def detect_img(yolo):
         if "input" in FLAGS:
             print(" Ignoring remaining command line arguments: " + FLAGS.input + "," + FLAGS.output)
         detect_img(YOLO(**vars(FLAGS)))
+
+    elif os.path.isdir(FLAGS.imgdir):
+        print("Image directory mode")
+        if "input" in FLAGS:
+            print(" Ignoring remaining command line arguments: " + FLAGS.input + "," + FLAGS.output)
+        detect_imgdir(YOLO(**vars(FLAGS)), FLAGS.imgdir, FLAGS.txt)
     elif "input" in FLAGS:
         detect_video(YOLO(**vars(FLAGS)), FLAGS.input, FLAGS.output)
     else: