[feat] add ai (#22)

Co-authored-by: Juyeonnn <[email protected]>
gdg-yonsei · Feb 18, 2024 · c4bc962 · c4bc962
1 parent 1831dbb
commit c4bc962
Show file tree

Hide file tree

Showing 18 changed files with 3,267 additions and 0 deletions.
diff --git a/ai/README.md b/ai/README.md
@@ -0,0 +1,90 @@
+## Data Generation ##
+
+**1) 문장 구사** : ```fluentify.ConDataGen(iterate_num=30)```
+
+
+```data/con-data.json``` : 문장구사평가를 위한 데이터 (```Gemini-pro-vision```)
+
+```data/con-img-pool.json```  : 문장구사평가 데이터 생성에 사용된 [이미지 데이터](https://huggingface.co/datasets/ehristoforu/dalle-3-images) (```Dalle-3``` ) 
+
+Example 
+```js
+
+[
+    {
+        "context": "Let's imagine that you are a brave captain of a big ship. You are sailing on the high seas. Suddenly, you see a beautiful sunset. Look at this picture and tell me...",
+        "question": "What colors can you see in the sky?",
+        "answer": "I see red, orange, yellow, and blue.",
+        "img": "1070.jpg"
+    },
+   ...
+]
+```
+**2) 발음** : ```fluentify.ProDataGen(iterate_num=30)```
+
+
+```data/pro-data.json``` : 발음평가를 위한 데이터 (```Gemini-pro```)
+
+```data/pro-topic-pool.json```  : 발음평가 데이터 생성에 사용된 주제 데이터 (```Gemini-pro```)
+
+Example 
+```js
+[
+    {
+        "practice-sentence": "I love to mix baking soda and vinegar together to create a fizzy experiment.",
+        "tip": "Remember to say 'mix' with your lips together and 'fizzy' with a big smile."
+    },
+  ...
+]
+```
+
+
+
+## Feedback Generation ##
+
+**1) 문장 구사** : ```fluentify.ConFeedback(con_input)```
+
+
+Input
+```js
+{
+    "user-answer": "Hmm.. a ship? maybe yellow? I may no",
+    "context": "Let's imagine that you are a brave captain of a big ship. You are sailing on the high seas. Suddenly, you see a beautiful sunset. Look at this picture and tell me...",
+    "question": "What colors can you see in the sky?",
+    "answer": "I see red, orange, yellow, and blue.",
+    "img": "1070.jpg"
+}
+```
+Output
+```js
+{
+    'positive-feedback': 'You are very creative! I like your imagination.', 
+    'negative-feedback': "Let's try to describe what we see in the picture. First, look at the sky. What colors can you see there?",
+    'enhanced-answer': 'In the sky, I can see yellow, orange, pink, and blue.'
+}
+```
+
+
+**2) 발음** : ```fluentify.ProFeedback(pro_input)```
+
+Input 
+```js
+{
+      "user-audio" : "example1.m4a",
+      "practice-sentence": "It's autumn now, and the leaves are turning beautiful colors.",
+      "tip": "Say 'aw-tum,' not 'ay-tum.'"
+}
+```
+Output
+```js
+{
+    'transcription': 'ITS AUTUMN NOW AND THE LEAVES ARE TURNING BEAUTIFUL COLORS', 
+    'wrong_idx': {'minor': [2, 9], 'major': []}, 
+    'pronunciation_score': 0.7, 
+    'decibel': 46.90759735625882, 
+    'speech_rate': 2.347417840375587, 
+    'positive-feedback': 'Pronunciation is correct. Keep up the good work!', 
+    'negative-feedback': ' '
+}
+```
+----
diff --git a/ai/data-gen.py b/ai/data-gen.py
@@ -0,0 +1,133 @@
+import os
+import requests
+import yaml 
+import json
+import vertexai
+import numpy as np 
+from vertexai.preview.generative_models import GenerativeModel, Part
+import random
+import ast
+
+import torch
+from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
+import torch
+from jiwer import wer
+import os 
+import json
+import math
+import torch
+import librosa
+from evaluate import load
+from jiwer import compute_measures
+import numpy as np
+from utils.word_process import get_wrong_idx
+
+
+# gcloud auth application-default login
+# gcloud auth application-default set-quota-project fluentify-412312
+
+class DataGeneration:
+    def __init__(self):
+        vertexai.init(project="fluentify-412312", location="asia-northeast3")
+        self.multimodal_model = GenerativeModel("gemini-pro-vision")
+        self.lang_model = GenerativeModel("gemini-pro")
+        self.current_path = os.path.dirname(__file__)
+        self.gcs_path = "gs://fluentify-412312.appspot.com"
+        with open(os.path.join(self.current_path,'data/prompt.yaml'), 'r', encoding='UTF-8') as file:
+            self.prompt = yaml.load(file, Loader=yaml.FullLoader)
+
+        self.audio_path = "./data/audio"
+        self.tokenizer =  AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+        self.model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+        self.feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
+        self.wer = load("wer")
+
+        self.speech_rate_threshol_h = 2.5
+        self.speech_rate_threshol_l = 1.0
+        self.decibel_threshold_h = 95
+        self.decibel_threshold_l = 45
+
+    def GenSent(self, topic):
+        topic = " ".join(topic) if type(topic) == list else topic
+        # print('used topic:', topic)
+        # print(self.prompt['gen-sent'])
+        prompt = f"{self.prompt['gen-sent']}".format(topic=topic)
+        response =self.lang_model.generate_content(prompt)
+        response=response.text.replace("```json","")
+        response=response.replace("```","")
+
+        try : 
+            return ast.literal_eval(response)
+        except:
+            return None 
+
+    def ImgFilter(self, img):
+        image = Part.from_uri(f"{self.gcs_path}/img/{img}", mime_type="image/jpeg")
+        prompt = self.prompt['img-filter']
+        response = self.multimodal_model.generate_content([prompt, image])
+        output = False if  "No" in response.text  else True
+        # print(output)
+        return output
+
+    def GenQA(self, img):
+        image = Part.from_uri(f"{self.gcs_path}/img/{img}", mime_type="image/jpeg")
+        prompt = self.prompt['gen-qa']
+        response = self.multimodal_model.generate_content([prompt, image])
+        # print(image, response.text)
+        try : 
+            return ast.literal_eval((response.text).strip())
+        except:
+            return None 
+
+    def ConDataGen(self, iterate_num):
+        data_path = f"{self.current_path}/data/con-data.json"
+        img_pool_path = f"{self.current_path}/data/con-img-pool.json"
+        with open(img_pool_path) as f:
+            img_pool = json.load(f) 
+        with open(data_path) as f:
+            con_data = json.load(f) 
+
+        # Generate Questions and Answers for ConText Evaluation
+        for i in range(iterate_num):
+            img = random.sample(img_pool, 1)
+            img = img[0] if type(img) == list else img
+
+            img_pool.remove(img) 
+            with open(img_pool_path,'w') as f:
+                    json.dump(img_pool, f,indent=4)
+
+
+            filter = self.ImgFilter(img)
+            if filter:
+                qa = self.GenQA(img) 
+                # if parsing is successful
+                if qa:
+                    qa.update({"img": img}) 
+                    print(qa)
+                    con_data += [qa]
+                    with open(data_path, "w") as f:
+                        json.dump(con_data, f,indent=4)
+            else:
+                print(f"Image {img} is not suitable for the context evaluation.")
+        return con_data
+
+    def ProDataGen(self, iterate_num):
+        data_path = f"{self.current_path}/data/pro-data.json"
+        topic_path = f"{self.current_path}/data/pro-topic-pool.json"
+        with open(topic_path) as f:
+            topic_pool = json.load(f) 
+        with open(data_path) as f:
+            pro_data = json.load(f) 
+
+        # Generate Sentences and Tips for Pronunciation Evaluation
+        for i in range(iterate_num):
+            topic = np.random.choice(topic_pool, size=5, replace=False)
+            sent = self.GenSent(topic) 
+            # if parsing is successful
+            if sent  : 
+                print(sent)
+                pro_data += sent
+                with open(data_path, "w") as f:
+                    json.dump(pro_data, f,indent=4)
+        return pro_data
+
diff --git a/ai/data/audio/example1.m4a b/ai/data/audio/example1.m4a
diff --git a/ai/data/audio/example2.m4a b/ai/data/audio/example2.m4a
diff --git a/ai/data/character/close-1.png b/ai/data/character/close-1.png
diff --git a/ai/data/character/open-0.png b/ai/data/character/open-0.png
diff --git a/ai/data/character/open-1.png b/ai/data/character/open-1.png
diff --git a/ai/data/character/temp.png b/ai/data/character/temp.png
diff --git a/ai/data/con-data.json b/ai/data/con-data.json
@@ -0,0 +1,110 @@
+[
+    {
+        "context": "Let's imagine that you are a brave captain of a big ship. You are sailing on the high seas. Suddenly, you see a beautiful sunset. Look at this picture and tell me...",
+        "question": "What colors can you see in the sky?",
+        "answer": "I see red, orange, yellow, and blue.",
+        "img": "1070.jpg"
+    },
+    {
+        "context": "Look at this picture. This is a woman. She is wearing a red turtleneck blouse and black pants. She has curly blond hair and brown eyes. She is standing in front of a window.",
+        "question": "What is she wearing?",
+        "answer": "She is wearing a red turtleneck blouse and black pants.",
+        "img": "486.jpg"
+    },
+    {
+        "context": "Look at this picture. What do you see?",
+        "question": "Can you describe what is happening in the picture?",
+        "answer": "I see a robot walking through a forest. The sky is a beautiful color. The trees are tall and green.",
+        "img": "185.jpg"
+    },
+    {
+        "context": "Look at this picture. This is a girl. Her name is Jane. She is sitting on a rock. She is wearing a purple outfit and brown shoes. Her hair is long and brown. She looks very peaceful.",
+        "question": "What is Jane doing?",
+        "answer": "Jane is sitting on a rock.",
+        "img": "1629.jpg"
+    },
+    {
+        "context": "Look at this cute kitty! What do you see in the picture?",
+        "question": "What is the kitty doing?",
+        "answer": "The kitty is sitting on a log.",
+        "img": "201.jpg"
+    },
+    {
+        "context": "Look at this beautiful landscape. Imagine you are standing on that rock. What can you see?",
+        "question": "What is she standing on?",
+        "answer": "She is standing on a rock.",
+        "img": "1116.jpg"
+    },
+    {
+        "context": "Let's imagine that you are a famous car designer and you have been asked to design a car for a superhero. This superhero has the power to control metal with his mind. What would be the most important thing to keep in mind when designing this car?",
+        "question": "What would be the most important thing to keep in mind when designing this car?",
+        "answer": "The most important thing to keep in mind when designing this car would be to make sure that it is made of a metal that the superhero can control with his mind.",
+        "img": "218.jpg"
+    },
+    {
+        "context": "Look at this picture. It's raining outside and the girl is wearing a black jacket. She has long brown hair and it's tied in a ponytail. She is standing in the woods and looks very peaceful.",
+        "question": "What do you think she is thinking about?",
+        "answer": "She is thinking about her family.",
+        "img": "506.jpg"
+    },
+    {
+        "context": "Look at this picture. This is a girl. She has beautiful blue eyes and freckles on her face. She is looking at us with a curious expression. What do you think she is thinking?",
+        "question": "What is the girl thinking?",
+        "answer": "She is thinking about something interesting.",
+        "img": "1483.jpg"
+    },
+    {
+        "context": "Look at this beautiful tree! It has so many branches and leaves. It's like a whole world in one tree. Do you see the little house in the middle? That's where the tree people live. They are very friendly and love to play. Let's make up a story about them. What do you think they are doing right now?",
+        "question": "What do you see in the picture?",
+        "answer": "I see a tree with a house in the middle. There are also some mountains and a river.",
+        "img": "284.jpg"
+    },
+    {
+        "context": "Look at this picture. This is Tarzan. He is a fictional character created by Edgar Rice Burroughs. Tarzan was raised by apes in the African jungle. He is a very strong and agile man. He can swing from trees and fight off wild animals. Tarzan is a very interesting character. He is kind and gentle, but he is also very brave and strong.",
+        "question": "What is Tarzan wearing?",
+        "answer": "He is wearing a loincloth.",
+        "img": "1206.jpg"
+    },
+    {
+        "context": "Look at this cozy bedroom! It's so peaceful and quiet. The perfect place to relax and read a book.",
+        "question": "What do you think is the best thing about this bedroom?",
+        "answer": "The best thing about this bedroom is the view of the city outside the window.",
+        "img": "1616.jpg"
+    },
+    {
+        "context": "A boy is standing on a cliff. He is looking at a beautiful landscape. There are mountains, trees, and a lake. The sky is blue, and there are two moons in the sky. The boy is amazed by the beauty of the landscape.",
+        "question": "What is the boy doing?",
+        "answer": "The boy is standing on a cliff and looking at a beautiful landscape.",
+        "img": "926.jpg"
+    },
+    {
+        "context": "Look at this beautiful picture. What do you see?",
+        "question": "Can you describe what is happening in the picture?",
+        "answer": "The picture shows a mountain valley with a river running through it. The river is wide and shallow, and it flows over rocks and stones. The valley is green and lush, and there are trees and bushes on the banks of the river. There are mountains in the background, and the sun is shining.",
+        "img": "1316.jpg"
+    },
+    {
+        "context": "Look at this picture. This is a silhouette of a girl. She looks just like you! She has a head, two arms, and two legs. What do you think she is doing?",
+        "question": "What do you think she is looking at?",
+        "answer": "She is looking at a beautiful sunset.",
+        "img": "502.jpg"
+    },
+    {
+        "context": "Look at this beautiful picture. This is a portrait of a young woman. She has long brown hair and brown eyes. She is wearing a white dress and a gold earring.",
+        "question": "What is the woman wearing?",
+        "answer": "The woman is wearing a white dress and a gold earring.",
+        "img": "1047.jpg"
+    },
+    {
+        "context": "Hi there! I'm an education specialist, and I'm here to help you with your language skills. I'm going to show you a picture, and then I'm going to ask you a question about it. Are you ready?",
+        "question": "What do you see in the picture?",
+        "answer": "I see a motorcycle.",
+        "img": "205.jpg"
+    },
+    {
+        "context": "Look at this beautiful picture! What do you see?",
+        "question": "Can you describe what you see in the picture?",
+        "answer": "I see a lot of stars and clouds in the picture. The colors are very bright and pretty.",
+        "img": "1583.jpg"
+    }
+]