Skip to content

Commit

Permalink
Add voice interface
Browse files Browse the repository at this point in the history
  • Loading branch information
jerhadf committed Oct 31, 2024
1 parent 832711c commit 3bca621
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 63 deletions.
145 changes: 84 additions & 61 deletions computer_use_demo/streamlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
sampling_loop,
)
from computer_use_demo.tools import ToolResult
from computer_use_demo.voice_interface import VoiceInterface

CONFIG_DIR = PosixPath("~/.anthropic").expanduser()
API_KEY_FILE = CONFIG_DIR / "api_key"
Expand Down Expand Up @@ -98,7 +99,7 @@ async def main():

st.markdown(STREAMLIT_STYLE, unsafe_allow_html=True)

st.title("Claude Computer Use Demo")
st.title("Computer Control Interface")

if not os.getenv("HIDE_WARNING", False):
st.warning(WARNING_TEXT)
Expand Down Expand Up @@ -147,73 +148,95 @@ def _reset_api_provider():
else:
st.session_state.auth_validated = True

chat, http_logs = st.tabs(["Chat", "HTTP Exchange Logs"])
new_message = st.chat_input(
"Type a message to send to Claude to control the computer..."
# Initialize voice interface
voice_interface = VoiceInterface(
anthropic_key=os.getenv("ANTHROPIC_API_KEY"),
hume_key=os.getenv("HUME_API_KEY")
)

with chat:
# render past chats
for message in st.session_state.messages:
if isinstance(message["content"], str):
_render_message(message["role"], message["content"])
elif isinstance(message["content"], list):
for block in message["content"]:
# the tool result we send back to the Anthropic API isn't sufficient to render all details,
# so we store the tool use responses
if isinstance(block, dict) and block["type"] == "tool_result":
_render_message(
Sender.TOOL, st.session_state.tools[block["tool_use_id"]]
)
else:
_render_message(
message["role"],
cast(BetaTextBlock | BetaToolUseBlock, block),
)

# render past http exchanges
for identity, response in st.session_state.responses.items():
_render_api_response(response, identity, http_logs)

# render past chats
if new_message:
st.session_state.messages.append(
{
"role": Sender.USER,
"content": [TextBlock(type="text", text=new_message)],
}
)
_render_message(Sender.USER, new_message)
chat, http_logs = st.tabs(["Chat", "HTTP Exchange Logs"])

try:
most_recent_message = st.session_state["messages"][-1]
except IndexError:
return
with chat:
# Add voice controls
voice_interface.render_voice_controls()

if most_recent_message["role"] is not Sender.USER:
# we don't have a user message to respond to, exit early
return
# Start voice connection in background
voice_task = asyncio.create_task(voice_interface.start_voice_connection())

with st.spinner("Running Agent..."):
# run the agent sampling loop with the newest message
st.session_state.messages = await sampling_loop(
system_prompt_suffix=st.session_state.custom_system_prompt,
model=st.session_state.model,
provider=st.session_state.provider,
messages=st.session_state.messages,
output_callback=partial(_render_message, Sender.BOT),
tool_output_callback=partial(
_tool_output_callback, tool_state=st.session_state.tools
),
api_response_callback=partial(
_api_response_callback,
tab=http_logs,
response_state=st.session_state.responses,
),
api_key=st.session_state.api_key,
only_n_most_recent_images=st.session_state.only_n_most_recent_images,
try:
# Continue with rest of Streamlit UI
new_message = st.chat_input(
"Type or speak a message to control the computer..."
)

if new_message:
await voice_interface.handle_voice_input(new_message)

# render past chats
for message in st.session_state.messages:
if isinstance(message["content"], str):
_render_message(message["role"], message["content"])
elif isinstance(message["content"], list):
for block in message["content"]:
# the tool result we send back to the Anthropic API isn't sufficient to render all details,
# so we store the tool use responses
if isinstance(block, dict) and block["type"] == "tool_result":
_render_message(
Sender.TOOL, st.session_state.tools[block["tool_use_id"]]
)
else:
_render_message(
message["role"],
cast(BetaTextBlock | BetaToolUseBlock, block),
)

# render past http exchanges
for identity, response in st.session_state.responses.items():
_render_api_response(response, identity, http_logs)

# render past chats
if new_message:
st.session_state.messages.append(
{
"role": Sender.USER,
"content": [TextBlock(type="text", text=new_message)],
}
)
_render_message(Sender.USER, new_message)

try:
most_recent_message = st.session_state["messages"][-1]
except IndexError:
return

if most_recent_message["role"] is not Sender.USER:
# we don't have a user message to respond to, exit early
return

with st.spinner("Running Agent..."):
# run the agent sampling loop with the newest message
st.session_state.messages = await sampling_loop(
system_prompt_suffix=st.session_state.custom_system_prompt,
model=st.session_state.model,
provider=st.session_state.provider,
messages=st.session_state.messages,
output_callback=partial(_render_message, Sender.BOT),
tool_output_callback=partial(
_tool_output_callback, tool_state=st.session_state.tools
),
api_response_callback=partial(
_api_response_callback,
tab=http_logs,
response_state=st.session_state.responses,
),
api_key=st.session_state.api_key,
only_n_most_recent_images=st.session_state.only_n_most_recent_images,
)
finally:
# Clean up voice connection
if voice_task and not voice_task.done():
voice_task.cancel()


def validate_auth(provider: APIProvider, api_key: str | None):
if provider == APIProvider.ANTHROPIC:
Expand Down
100 changes: 100 additions & 0 deletions computer_use_demo/voice_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from typing import Optional, cast

Check failure on line 1 in computer_use_demo/voice_interface.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

computer_use_demo/voice_interface.py:1:20: F401 `typing.Optional` imported but unused

Check failure on line 1 in computer_use_demo/voice_interface.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F401)

computer_use_demo/voice_interface.py:1:30: F401 `typing.cast` imported but unused
import asyncio
import streamlit as st
from hume.client import AsyncHumeClient
from hume.empathic_voice.chat.socket_client import ChatConnectOptions
from hume import MicrophoneInterface, Stream
from anthropic.types.beta import BetaContentBlock
import os

from .loop import sampling_loop, APIProvider, PROVIDER_TO_DEFAULT_MODEL_NAME

class VoiceInterface:
    """Bridges Hume's Empathic Voice Interface (EVI) with the Claude
    computer-use sampling loop.

    Voice transcripts arriving over the EVI socket are forwarded to the
    Anthropic sampling loop as user messages, and Claude's text replies
    are sent back to EVI for speech synthesis.
    """

    def __init__(self, anthropic_key: str, hume_key: str):
        self.anthropic_key = anthropic_key
        self.hume_client = AsyncHumeClient(api_key=hume_key)
        # Populated once start_voice_connection() opens the EVI socket.
        self.socket = None
        self.byte_stream = Stream.new()

    def init_streamlit_state(self):
        """Ensure the Streamlit session-state keys this class reads exist."""
        if "messages" not in st.session_state:
            st.session_state.messages = []
        if "tools" not in st.session_state:
            st.session_state.tools = {}
        if "is_recording" not in st.session_state:
            st.session_state.is_recording = False

    async def handle_assistant_response(self, content: BetaContentBlock):
        """Forward a text block from Claude to EVI for voice synthesis.

        Non-text blocks (e.g. tool_use) have no `text` attribute and are
        skipped; nothing is sent before the socket is connected.
        """
        if hasattr(content, "text") and self.socket:
            await self.socket.send_assistant_message(content.text)

    async def handle_voice_input(self, text: str):
        """Append `text` as a user message and run the agent sampling loop.

        Used for both voice transcripts (from the EVI socket) and typed
        chat input.
        """
        st.session_state.messages.append(
            {
                "role": "user",
                "content": [{"type": "text", "text": text}],
            }
        )

        # NOTE(review): this ignores st.session_state.custom_system_prompt,
        # unlike the streamlit UI path — confirm whether voice turns should
        # share the user's custom suffix.
        st.session_state.messages = await sampling_loop(
            model=PROVIDER_TO_DEFAULT_MODEL_NAME[APIProvider.ANTHROPIC],
            provider=APIProvider.ANTHROPIC,
            system_prompt_suffix="You are being controlled by voice or text commands.",
            messages=st.session_state.messages,
            output_callback=self.handle_assistant_response,
            # NOTE(review): if sampling_loop invokes these callbacks without
            # awaiting them, the async handle_tool_output coroutine is never
            # run — verify against sampling_loop's calling convention.
            tool_output_callback=self.handle_tool_output,
            api_response_callback=lambda x: None,
            api_key=self.anthropic_key,
        )

    async def handle_tool_output(self, result, tool_id):
        """Record a tool result so the chat tab can render it later."""
        st.session_state.tools[tool_id] = result

    def render_voice_controls(self):
        """Render the microphone toggle button and a recording status line."""
        col1, col2 = st.columns([1, 4])
        with col1:
            if st.button("🎤", key="mic_button"):
                st.session_state.is_recording = not st.session_state.is_recording

        with col2:
            if st.session_state.is_recording:
                st.write("Recording... Click 🎤 to stop")
            else:
                st.write("Click 🎤 to start speaking")

    async def start_voice_connection(self):
        """Open the EVI chat socket and pump microphone audio while recording.

        Runs until cancelled; on any error the failure is surfaced in the
        Streamlit UI and recording is stopped.
        """
        options = ChatConnectOptions(
            config_id=os.getenv("HUME_CONFIG_ID"),
            secret_key=os.getenv("HUME_SECRET_KEY"),
        )

        try:
            # Fix: `options` was previously constructed but never passed,
            # so HUME_CONFIG_ID / HUME_SECRET_KEY were silently ignored.
            async with self.hume_client.empathic_voice.chat.connect(
                options=options
            ) as socket:
                self.socket = socket

                # Route finalized user speech transcripts into the agent loop.
                async def on_message(msg):
                    if msg.type == "user_message":
                        await self.handle_voice_input(msg.message.content)

                await socket.subscribe(on_message)

                # Poll the recording flag; while it is set, stream mic audio
                # to EVI. MicrophoneInterface.start blocks for the duration
                # of a capture session.
                while True:
                    if st.session_state.is_recording:
                        await MicrophoneInterface.start(
                            socket,
                            allow_user_interrupt=True,
                            byte_stream=self.byte_stream,
                        )
                    await asyncio.sleep(0.1)
        except Exception as e:
            st.error(f"Voice connection error: {str(e)}")
            st.session_state.is_recording = False
        finally:
            # The socket is closed once the context manager exits.
            self.socket = None

    async def start(self):
        """Initialize Streamlit state, then run the voice connection loop."""
        self.init_streamlit_state()
        await self.start_voice_connection()

Check failure on line 100 in computer_use_demo/voice_interface.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (W292)

computer_use_demo/voice_interface.py:100:44: W292 No newline at end of file
17 changes: 15 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
import asyncio
import os
from dotenv import load_dotenv
from computer_use_demo.voice_interface import VoiceInterface

from computer_use_demo.streamlit import main
async def main():
    """Entry point: load environment configuration and run the voice interface.

    Reads ANTHROPIC_API_KEY and HUME_API_KEY from the environment (a local
    .env file is honored via python-dotenv) and fails fast with a clear
    message when either is missing, instead of passing None into the API
    clients and failing obscurely later.
    """
    load_dotenv()

    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
    hume_key = os.getenv("HUME_API_KEY")
    missing = [
        name
        for name, value in (
            ("ANTHROPIC_API_KEY", anthropic_key),
            ("HUME_API_KEY", hume_key),
        )
        if not value
    ]
    if missing:
        raise SystemExit(
            f"Missing required environment variables: {', '.join(missing)}"
        )

    interface = VoiceInterface(
        anthropic_key=anthropic_key,
        hume_key=hume_key,
    )

    try:
        await interface.start()
    except KeyboardInterrupt:
        print("\nShutting down...")

Check failure on line 17 in main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (T201)

main.py:17:9: T201 `print` found

# Guard the entry point so importing this module never starts the event loop.
# Fix: a second, unguarded asyncio.run(main()) call was present, which would
# execute main() on import (and twice when run as a script).
if __name__ == "__main__":
    asyncio.run(main())

Check failure on line 20 in main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (W292)

main.py:20:24: W292 No newline at end of file

0 comments on commit 3bca621

Please sign in to comment.