import os

import cv2
import requests
import streamlit as st
from PIL import Image

import groundingdino.datasets.transforms as T
from groundingdino.util.inference import load_model, load_image, predict, annotate

from user_input import main
from intake_camera_data import main as search
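# main_st.py: Vision Voyager demo.
# Rough flow: the user describes an object (as text or an uploaded image),
# then the app polls an ESP32-CAM video stream and runs GroundingDINO
# zero-shot detection on each frame until the object is found above a
# confidence threshold.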
st.title("Welcome to the Vision Voyager Demo!")
input_type = st.radio('How would you like to describe your object?', ['Text', 'Image'])
st.session_state['input_type'] = input_type
def get_image_path(img):
    # Create a directory and save the uploaded image.
    file_path = f"data/uploadedImages/{img.name}"
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as img_file:
        img_file.write(img.getbuffer())
    return file_path
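# Streamlit reruns this script from the top on every interaction, so the
# user's choices below are stashed in st.session_state to survive reruns.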
if st.session_state['input_type'] == "Image":
    uploaded_file = st.file_uploader("Upload your image here!")
    if uploaded_file is not None:
        st.session_state['uploaded_file'] = uploaded_file
        # get_image_path saves the upload and returns its path, so reload
        # the saved file with PIL rather than decoding raw bytes
        file_path = get_image_path(uploaded_file)
        image = Image.open(file_path)
        st.session_state['object_description'] = image
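        # note: this stores a PIL image as the object "description", while
        # predict() below expects a text caption, so the image flow is still
        # a stub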
if st.session_state['input_type'] == "Text":
    object_description = st.text_input("Describe your object here!")
    if 'object_description' not in st.session_state:
        st.session_state['object_description'] = object_description
#item_description = main()
start_initialized = st.button('Start demo!')
#if start_initialized:
# search()
if start_initialized:
    text_prompt = st.session_state['object_description']
    # set model configuration and weight file paths
    CONFIG_PATH = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
    WEIGHTS_PATH = 'GroundingDINO/weights/groundingdino_swint_ogc.pth'
    # set text prompt
    TEXT_PROMPT = text_prompt
    # set box, text, and prediction threshold values
    BOX_THRESHOLD = 0.3
    TEXT_THRESHOLD = 0.2
    PREDICTION_THRESHOLD = 0.3
    model = load_model(CONFIG_PATH, WEIGHTS_PATH)
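    # Detection loop: keep grabbing frames from the ESP32-CAM stream and
    # running GroundingDINO on each one until a detection clears
    # PREDICTION_THRESHOLD for the prompted object.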
    item_not_found = True
    while item_not_found:
        # ESP32-CAM base URL
        URL = 'http://172.20.10.6'
        AWB = True
        # OpenCV capture from the camera's MJPEG stream
        cap = cv2.VideoCapture(URL + ":81/stream")
        requests.get(URL + "/control?var=framesize&val={}".format(10))
        requests.get(URL + "/control?var=quality&val={}".format(10))
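        # the /control endpoints above come from the stock ESP32-CAM web
        # server firmware: framesize selects the sensor resolution and
        # quality sets JPEG compression (lower values mean higher quality)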
        ret, frame = cap.read()
        if not ret:
            print('failed to read from stream, retrying')
            continue
        print('reading from stream')
        # create a transform pipeline of 3 image transformations
        transform = T.Compose(
            [
                T.RandomResize([800], max_size=1333),
                T.ToTensor(),
                T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]
        )
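        # the Normalize constants are the standard ImageNet channel means
        # and standard deviations that GroundingDINO's backbone expects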
        # convert the BGR OpenCV frame to RGB, then wrap it as a PIL image
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        image_source = Image.fromarray(frame_rgb)
        # apply the transforms to get a model-ready tensor
        image_transformed, _ = transform(image_source, None)
        image_source.save('test.jpg')
        st.image('test.jpg', 'Captured Image')
        # predict boxes, logits, phrases
        print(TEXT_PROMPT)
        boxes, logits, phrases = predict(
            model=model,
            image=image_transformed,
            caption=TEXT_PROMPT,
            box_threshold=BOX_THRESHOLD,
            text_threshold=TEXT_THRESHOLD,
            device='cpu')
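        # predict() returns boxes in normalized cxcywh format, a per-box
        # confidence score in `logits`, and the text phrase matched to each box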
        # annotate the image (annotate() expects an RGB array and returns BGR)
        annotated_frame = annotate(image_source=frame_rgb, boxes=boxes, logits=logits, phrases=phrases)
        # display the output; imshow and imwrite both expect BGR
        cv2.imshow('frame', annotated_frame)
        # save the output to JPG
        cv2.imwrite('model_output.jpg', annotated_frame)
        st.image('model_output.jpg', f'Identified Objects, Confidence: {logits}')
        # waitKey also pumps the HighGUI event loop so the window refreshes
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        # check whether any detection clears the prediction threshold
        if logits.numel() > 0:
            for value in logits.view(-1):
                ## TODO find highest value if not too lazy
                if value > PREDICTION_THRESHOLD:
                    item_not_found = False
                    print("Item found!")
                    print("Item is: " + TEXT_PROMPT)
                    print("Confidence: " + str(value))
                    #print("Box: " + str(boxes))
                    print("Text: " + str(phrases))
                    cv2.imwrite('item_found.jpg', annotated_frame)
                    st.success('Item Found!')
                    break
        if not item_not_found:
            break
    cap.release()
    cv2.destroyAllWindows()
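# To launch the demo (assuming Streamlit is installed):
#   streamlit run main_st.py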