
Commit

1. Soul.img_understand supports the openai request type. 2. streetview sence now defaults to True.
PinkGranite committed Apr 16, 2024
1 parent 0bb4df1 commit 8f47848
Showing 4 changed files with 94 additions and 32 deletions.
23 changes: 16 additions & 7 deletions README.md
@@ -32,9 +32,10 @@ llm_request:
model: xxx
(api_base): xxx (this is an optional config; if you use openai and want to use your own backend LLM model, it defaults to "https://api.openai.com/v1")
img_understand_request:
request_type: qwen
request_type: openai / qwen
api_key: xxx
model: xxx
model: xxx ('gpt-4-turbo' if you use openai)
(api_base): same as text_request
img_generate_request:
request_type: qwen
api_key: xxx
@@ -63,11 +64,19 @@ apphub_request:
#### LLM_REQUEST
- The whole CityAgent is built on top of an LLM; there are currently three groups of config items: **text_request**, **img_understand_request** and **img_generate_request** (a minimal loading sketch follows this list)
- **text_request**
  - Currently, we support [**qwen**](https://tongyi.aliyun.com/) and [**openai**](https://openai.com/)
  - `Notice: Our experiments are mostly run with qwen. If you prefer openai, you may run into problems; feel free to open an issue.`
  - Get your **api_key** and choose your **model**
  - If you want to use your own backend models, set the **api_base** (only available when using **openai**)
    - default value: "https://api.openai.com/v1"
- **img_understand_request**
  - Currently, we support **qwen** and **openai**
  - If you choose **openai**, the **model** has to be '**gpt-4-turbo**'
  - If you want to use your own backend models, set the **api_base** (only available when using **openai**)
    - default value: "https://api.openai.com/v1"
- **img_generate_request**
  - Currently, only **qwen** is supported
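
For reference, here is a minimal loading sketch showing how these config items might be wired together. It is hypothetical: the import path and the `LLMConfig`/`UrbanLLM` constructor signatures are assumptions inferred from `pycityagent/urbanllm/urbanllm.py` and may differ from the actual API.

```python
import yaml
from pycityagent.urbanllm import LLMConfig, UrbanLLM  # import path assumed

# Load the YAML config described above
with open('example/config_template.yaml', 'r') as f:
    config = yaml.safe_load(f)

# llm_request holds the three sections: text_request, img_understand_request, img_generate_request
llm_config = LLMConfig(config['llm_request'])  # assumed constructor signature
soul = UrbanLLM(llm_config)                    # assumed constructor signature

# A plain text request; the dialog uses the usual chat-message format
answer = soul.text_request([{'role': 'user', 'content': 'Hello, who are you?'}])
print(answer)
```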

#### CITYSIM_REQUEST
- Most of the configuration options in this part are predetermined, such as **simulator.server**, **map_request.mongo_coll**, **route_request.server**
5 changes: 3 additions & 2 deletions example/config_template.yaml
@@ -5,9 +5,10 @@ llm_request:
model: xxx
(api_base): xxx (this is an optional config; if you use openai and want to use your own backend LLM model, it defaults to "https://api.openai.com/v1")
img_understand_request:
request_type: qwen
request_type: openai / qwen
api_key: xxx
model: xxx
model: xxx ('gpt-4-turbo' if you use openai)
(api_base): same as text_request
img_generate_request:
request_type: qwen
api_key: xxx
6 changes: 3 additions & 3 deletions pycityagent/brain/sence.py
@@ -146,10 +146,10 @@ def __init__(self, agent, sence_raduis:int=10) -> None:
SencePlug Buffer: used to store the sence plug content
"""

self.enable_streeview = False
self.enable_streeview = True
"""
街景感知功能接口, 默认为False
Interface of streetview function, default: False
街景感知功能接口, 默认为True
Interface of streetview function, default: True
"""

self._lane_type_mapping = {1: 'driving', 2: 'walking'}
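Since this commit flips `enable_streeview` to `True`, street-view perception is now active by default. The sketch below shows one way to opt back out for a single agent; it is an assumption-heavy illustration: the attribute path used to reach the `Sence` instance depends on how your agent object is wired, and only the `enable_streeview` attribute itself comes from `pycityagent/brain/sence.py`.

```python
def disable_streetview(agent) -> None:
    """Opt a single agent out of the new default (enable_streeview=True).

    `agent.brain.sence` is an assumed accessor; adjust it to however your
    agent exposes its Sence object.
    """
    agent.brain.sence.enable_streeview = False  # no street-view images will be requested
```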
92 changes: 72 additions & 20 deletions pycityagent/urbanllm/urbanllm.py
@@ -3,12 +3,16 @@
from openai import OpenAI
from http import HTTPStatus
import dashscope
from urllib.parse import urlparse, unquote
from pathlib import PurePosixPath
import requests
from dashscope import ImageSynthesis
from PIL import Image
from io import BytesIO
from typing import Union
import base64

def encode_image(image_path):
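# Read the image file and return its contents as a base64-encoded string (used for OpenAI image data URLs)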
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')

class LLMConfig:
"""
@@ -84,37 +88,85 @@ def text_request(self, dialog:list[dict], temperature:float=1, max_tokens:int=None
print("ERROR: Wrong Config")
return "wrong config"

def img_understand(self, img_path:str, prompt:str=None) -> str:
def img_understand(self, img_path:Union[str, list[str]], prompt:str=None) -> str:
"""
图像理解
Image understanding
Args:
- img_path: 目标图像的路径. The path of selected Image
- prompt: 理解提示词 - 例如理解方向. The understanding prompts
- img_path (Union[str, list[str]]): 目标图像的路径, 既可以是一个路径也可以是包含多张图片路径的list. The path of the selected image, either a single path or a list of paths
- prompt (str): 理解提示词 - 例如理解方向. The understanding prompts
Returns:
- (str): the understanding content
"""
ppt = "如何理解这幅图像?"
if prompt != None:
ppt = prompt
dialog = [{
'role': 'user',
'content': [
{'image': 'file://' + img_path},
{'text': ppt}
]
}]
response = dashscope.MultiModalConversation.call(
model=self.config.image_u['model'],
api_key=self.config.image_u['api_key'],
messages=dialog
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content']
if self.config.image_u['request_type'] == 'openai':
if 'api_base' in self.config.image_u.keys():
api_base = self.config.image_u['api_base']
else:
api_base = None
client = OpenAI(
api_key=self.config.image_u['api_key'],
base_url=api_base,
)
content = []
content.append({'type': 'text', 'text': ppt})
if isinstance(img_path, str):
base64_image = encode_image(img_path)
content.append({
'type': 'image_url',
'image_url': {
'url': f"data:image/jpeg;base64,{base64_image}"
}
})
elif isinstance(img_path, list) and all(isinstance(item, str) for item in img_path):
for item in img_path:
base64_image = encode_image(item)
content.append({
'type': 'image_url',
'image_url': {
'url': f"data:image/jpeg;base64,{base64_image}"
}
})
response = client.chat.completions.create(
model=self.config.image_u['model'],
messages=[{
'role': 'user',
'content': content
}]
)
return response.choices[0].message.content
elif self.config.image_u['request_type'] == 'qwen':
content = []
if isinstance(img_path, str):
content.append({'image': 'file://' + img_path})
content.append({'text': ppt})
elif isinstance(img_path, list) and all(isinstance(item, str) for item in img_path):
for item in img_path:
content.append({
'image': 'file://' + item
})
content.append({'text': ppt})

dialog = [{
'role': 'user',
'content': content
}]
response = dashscope.MultiModalConversation.call(
model=self.config.image_u['model'],
api_key=self.config.image_u['api_key'],
messages=dialog
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content']
else:
print(response.code) # The error code.
return "Error"
else:
print(response.code) # The error code.
print("ERROR: wrong image understanding type, only 'openai' and 'openai' is available")
return "Error"

def img_generate(self, prompt:str, size:str='512*512', quantity:int = 1):
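A short usage sketch for the extended `img_understand` follows. It is hypothetical: `soul` stands for an already constructed wrapper (as in the loading sketch in the README section above), and the prompts and image paths are placeholders.

```python
def describe_streetview(soul, image_paths: list[str]):
    """Illustrates both call styles of img_understand after this commit."""
    # A single image: pass a plain path string (works for both 'qwen' and 'openai')
    first = soul.img_understand(image_paths[0], 'Describe this street scene.')
    # Several images at once (new in this commit): pass a list of path strings
    combined = soul.img_understand(image_paths, 'Summarize what an agent standing here would see.')
    return first, combined
```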
