# Yolo8s_with_audio_activation.py
# (copied from a GitHub file view: 116 lines, 98 loc, 4.24 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from ultralytics import YOLO
import cv2
import cvzone
import math
import speech_recognition as sr
from gtts import gTTS
from io import BytesIO
import pygame
# Load the YOLO model once, at module import, so every call to
# object_detection() reuses the same instance. The weights location is a
# relative path -- presumably resolved against the working directory the
# script is launched from; verify before running from elsewhere.
model = YOLO("../Yolo-Weights/yolov8s")
# Label table for detections: object_detection() looks a name up with
# classNames[cls], so the position of each entry is significant and the
# order below must not change. Five labels per row; the trailing comment
# gives the index range of that row.
classNames = [
    "person", "bicycle", "car", "motorbike", "aeroplane",               # 0-4
    "bus", "train", "truck", "boat", "traffic light",                   # 5-9
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",      # 10-14
    "cat", "dog", "horse", "sheep", "cow",                              # 15-19
    "elephant", "bear", "zebra", "giraffe", "backpack",                 # 20-24
    "umbrella", "handbag", "tie", "suitcase", "frisbee",                # 25-29
    "skis", "snowboard", "sports ball", "kite", "baseball bat",         # 30-34
    "baseball glove", "skateboard", "surfboard", "tennis racket",       # 35-38
    "bottle",                                                           # 39
    "wine glass", "cup", "fork", "knife", "spoon",                      # 40-44
    "bowl", "banana", "apple", "sandwich", "orange",                    # 45-49
    "broccoli", "carrot", "hot dog", "pizza", "donut",                  # 50-54
    "cake", "chair", "sofa", "pottedplant", "bed",                      # 55-59
    "diningtable", "toilet", "tvmonitor", "laptop", "mouse",            # 60-64
    "remote", "keyboard", "cell phone", "microwave", "oven",            # 65-69
    "toaster", "sink", "refrigerator", "book", "clock",                 # 70-74
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",       # 75-79
]
# Function to convert text to speech
def text_to_audio(text, language='en'):
    """Speak ``text`` aloud and return once playback has finished.

    The speech is synthesised with gTTS into an in-memory buffer and played
    through the pygame mixer, so no temporary audio file is written.

    Args:
        text: The sentence to speak.
        language: Language code handed to gTTS (defaults to ``'en'``).
    """
    tts = gTTS(text=text, lang=language, slow=False)
    audio_data = BytesIO()
    tts.write_to_fp(audio_data)
    audio_data.seek(0)  # rewind so the mixer reads the clip from its start

    pygame.mixer.init()
    pygame.mixer.music.load(audio_data, 'mp3')
    pygame.mixer.music.play()

    # Block until the clip ends, but sleep between polls: a bare
    # `while busy: continue` spin keeps one CPU core fully loaded for the
    # whole duration of the speech.
    while pygame.mixer.music.get_busy():
        pygame.time.wait(50)
# Function to listen for voice commands
def listen_for_command(timeout=None, phrase_time_limit=None):
    """Record one spoken phrase from the microphone and return it as text.

    Args:
        timeout: Maximum number of seconds to wait for speech to begin.
            ``None`` (the default) waits indefinitely.
        phrase_time_limit: Maximum number of seconds a phrase may last
            before recording is cut off. ``None`` (the default) means no
            limit.

    Returns:
        The recognised phrase converted to lower case, or ``None`` when no
        speech started within ``timeout``, the audio could not be
        understood, or the recognition request failed.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Listening for a command...")
        try:
            audio = recognizer.listen(
                source, timeout=timeout, phrase_time_limit=phrase_time_limit
            )
        except sr.WaitTimeoutError:
            # Only reachable when a timeout was requested and nobody spoke.
            return None

    # Recognition is a network round trip on already-captured audio, so it
    # runs after the microphone has been released.
    try:
        return recognizer.recognize_google(audio).lower()
    except sr.UnknownValueError:
        print("Could not understand audio.")
        return None
    except sr.RequestError:
        print("Network error.")
        return None
# Object detection function
def object_detection():
    """Run webcam object detection with spoken scene descriptions.

    Each loop iteration grabs one frame from camera 0, runs the module-level
    ``model`` on it, draws a box and label for every detection, speaks a
    summary of the detections whose confidence exceeds 0.5, shows the
    annotated frame and then listens for a voice command.

    The loop ends when the 'q' key is pressed, when a command containing
    "stop" is heard, or when the camera stops delivering frames. The camera
    and the preview window are released on every exit path, including
    exceptions.
    """
    cap = cv2.VideoCapture(0)
    try:
        while True:
            success, img = cap.read()
            if not success:
                # A failed read yields no usable frame; passing it on to the
                # model and to imshow would fail, so stop cleanly instead.
                print("Could not read a frame from the camera.")
                break

            detected_objects = []
            for r in model(img, stream=True):
                for box in r.boxes:
                    x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                    w, h = x2 - x1, y2 - y1
                    cvzone.cornerRect(img, (x1, y1, w, h))

                    # Confidence rounded up to two decimal places.
                    conf = math.ceil(box.conf[0] * 100) / 100
                    currentClass = classNames[int(box.cls[0])]

                    # Closeness estimation based on confidence threshold
                    # and bounding box size: a box covering more than
                    # 50000 pixels is reported as "closer".
                    if conf > 0.5:
                        closeness = "closer" if w * h > 50000 else "farther"
                        detected_objects.append(f"{currentClass} is {closeness}")

                    # Annotate the frame; the clamping keeps the label
                    # inside the image when the box touches the top/left.
                    cvzone.putTextRect(
                        img, f'{currentClass} {conf}', (max(0, x1), max(35, y1))
                    )

            # Construct scene description
            if detected_objects:
                scene_description = " and ".join(detected_objects)
                text_to_audio(f"The scene contains: {scene_description}")

            cv2.imshow("Image", img)

            # Stop on 'q' key press or 'stop' voice command
            if cv2.waitKey(1) == ord('q'):
                break

            # NOTE(review): listen_for_command() waits for a spoken phrase,
            # so the preview only advances (and 'q' is only polled) between
            # phrases.
            command = listen_for_command()
            if command and "stop" in command:
                text_to_audio("Stopping object detection")
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
# Main function
def main():
    """Voice-driven top-level loop.

    Repeatedly listens for a command:

    * a phrase containing "activate" starts object detection and returns
      to this loop when detection stops;
    * a phrase containing "exit" announces the shutdown and ends the loop;
    * anything else -- including audio that could not be recognised --
      prints a hint listing the accepted commands.
    """
    while True:
        command = listen_for_command()
        if command and "activate" in command:
            text_to_audio("Object detection activated.")
            object_detection()
        elif command and "exit" in command:
            text_to_audio("Exiting the program.")
            break
        else:
            # Reached both for unintelligible audio (command is None) and
            # for phrases that match neither keyword, so the user always
            # gets feedback.
            print("Command not recognized. Say 'activate' or 'exit'.")
# Start the program only when this file is executed as a script, so that
# importing it does not immediately start listening on the microphone.
if __name__ == "__main__":
    main()