-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
206 lines (183 loc) · 9.87 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Install transformers from source - only needed for versions <= v4.34
# pip install git+https://github.com/huggingface/transformers.git
# pip install accelerate
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoProcessor
import pandas as pd
import numpy as np
from error_code_fixer import error_code_fixer, double_check
import geopandas as gpd
from shapely.geometry import Point
import logging
import os
# Quieten TensorFlow's C++ logging and the transformers logger so the console
# mostly shows the model output printed further down.
# NOTE(review): this runs after the torch/transformers imports above; if
# TensorFlow is loaded during those imports the env var may be set too late
# to take effect - confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
logging.getLogger("transformers").setLevel(logging.ERROR)

# Registry of models to evaluate: short code ("kode") -> Hugging Face model id.
# The generation loop at the bottom of the file iterates over this mapping.
#
# Candidates that are currently switched off (add an entry to `models` to
# re-enable one):
#   'mistral2'   -> "mistralai/Mistral-7B-Instruct-v0.2"
#   'zephyr'     -> "HuggingFaceH4/zephyr-7b-beta"
#   'openhermes' -> "teknium/OpenHermes-2.5-Mistral-7B"
#   'solar'      -> "upstage/SOLAR-10.7B-Instruct-v1.0"
#   'mistral3'   -> "mistralai/Mistral-7B-Instruct-v0.3"
#   'llama'      -> "meta-llama/Meta-Llama-3-8B-Instruct"
#   'gradient'   -> "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
kode = "starcoder"
model_name = "bigcode/starcoder2-15b-instruct-v0.1"
models = {kode: model_name}
# pipe = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto", token="hf_XXXXXXX")

# Tabular input: the CSV that the prompt context (columns, head, NUTS_0 codes)
# is derived from. The IRIS path is an alternative dataset that is switched off.
# file_path = "/home/fkriskov/diplomski/datasets/IRIS flower dataset.csv"
file_path = "/home/fkriskov/diplomski/datasets/lucas-soil-2018-v2/lucas-soil-2018 copy.csv"
df = pd.read_csv(file_path)

# Europe coastline shapefile; only its column names are used in the prompt.
europe = gpd.read_file(
    "/home/fkriskov/diplomski/datasets/Europe_coastline_shapefile/Europe_coastline.shp"
)

# Disabled one-off export: builds point geometries from TH_LONG / TH_LAT,
# aligns the CRS with the Europe shapefile and writes geo_dataframe.shp,
# which is the file the prompt tells the model to load.
# geometry = [Point(xy) for xy in zip(df['TH_LONG'], df['TH_LAT'])]
# geo_df = gpd.GeoDataFrame(df, geometry=geometry)
# geo_df.set_crs(epsg=4326, inplace=True)
# if europe.crs != geo_df.crs:
#     geo_df = geo_df.to_crs(europe.crs)
# geo_df.to_file('/home/fkriskov/diplomski/datasets/geo_dataframe.shp')
# Numbered rules appended to every prompt. Rule 2 mentions the ```python fence
# once; the generation loop counts that occurrence when it looks for the
# model's own code block, so keep the wording in sync with that logic.
# The trailing empty item makes the joined text end with a newline.
instruction_str = "\n".join(
    [
        "1. Convert the query to executable Python code using Pandas.",
        "2. The solution should be a Python expression that can be called with the `exec()` function, inside '```python ```'.",
        "3. The code should represent a solution to the query.",
        "4. If not instructed otherwise, print the final result variable.",
        "5. If you are asked to plot something, save it as a plot.png.",
        "6. Don't explain the code.",
        "",
    ]
)
# Values interpolated into the prompt text.

# First rows of the dataset as plain text. to_string() renders every column;
# str() of a wide DataFrame may be wrapped or shortened with "...", which
# would hide columns from the model.
df_str = df.head().to_string()

# Distinct NUTS_0 codes. Missing values are dropped and the result is sorted
# so the prompt is identical from run to run (set iteration order is not).
codes = sorted(set(df['NUTS_0'].dropna()))

# Column names of the dataset and of the Europe shapefile, both as plain lists
# so they are rendered the same way in the prompt.
columns = list(df.columns)
europe_columns = list(europe.columns)
# Query selection: exactly ONE `user_query, filename = ...` line must be
# active. `user_query` is interpolated into the prompt f-string and `filename`
# is the prefix for saved results; with every line commented out the script
# stops with a NameError while building the prompt.
# To run a different experiment, comment out the active line and enable another.
# user_query, filename = "Which land type (LC0_Desc) has the highest average 'pH_H2O'.", "desc_1_"
# user_query, filename = "Plot the average 'OC' for each land type (LC0_Desc). save it as a png", "desc_2_"
# user_query, filename = "Is there a significant relationship between land type (LC0_Desc) and pH_H2O? Use chi square from scipy.", "infer_1_"
# user_query, filename = "Perform a t-test to compare 'K' between Grassland and Cropland.", "infer_4_"
# user_query, filename = "Plot a linear regression analysis to see the relationship between 'pH_H2O' and 'K'. save it as a png", "infer_5_"
# user_query, filename = "Construct a 95% confidence interval for the mean 'OC' content in the dataset.", "infer_6_"
# user_query, filename = "Using the Central Limit Theorem, simulate the sampling distribution of the mean 'pH_H2O' for sample sizes of 30. Plot the distribution and compare it to the normal distribution.", "infer_7_"
# user_query, filename = "Calculate the z-scores for 'EC' and identify any outliers (z-score > 3 or < -3). use zscore from scipy", "infer_8_"
# user_query, filename = "Perform a hypothesis test to determine if the mean 'K' content in the entire dataset is significantly different from 2%. Use a t-test for the hypothesis test.", "infer_9_"
# user_query, filename = "Calculate the p-value for the correlation between 'P' and 'K'. Determine if the correlation is statistically significant.", "infer_10_"
# user_query, filename = "Plot the points with 'pH_H2O'>5 blue and 'pH_H2O'<5 red in Europe. save it as a png", "geo_8_"
# user_query, filename = "Create a map displaying the distribution of soil types ('LC0_Desc') across Europe. Each soil type should be represented by a different color. Use geopandas and save the map as a png.", "geo_9_"
# Active query (a geo task, matching the GeoDataFrame-oriented prompt in use).
user_query, filename = "Plot all the LC0_Desc='Grassland' and LC0_Desc='Woodland' points where 'OC'>20. Use geopandas and save the map as a png.", "geo_10_"
# Chat history handed to the tokenizer's chat template
# (see https://huggingface.co/docs/transformers/main/en/chat_templating).
#
# Alternative opening line for plain-CSV tasks (disabled):
#"You are working with a CSV file that is located in {file_path}"
# Fragments used by the geo variant of the prompt (active in the f-string below):
# "You are working with a GeoDataFrame that is located in '/home/fkriskov/diplomski/datasets/geo_dataframe.shp'."
# "Plot the Europe shapefile that is located in '/home/fkriskov/diplomski/datasets/Europe_coastline_shapefile/Europe_coastline.shp'."
# "You cannot merge these shapefiles,just plot them."
# "set marker='.' and figsize (10,10)"
# "These are the columns of the dataframe:"
# {columns}
# "These are the columns of the Europe Shapefile:"
# {europe_columns}
#
# A single user turn: dataset description, column lists, NUTS_0 codes, the
# dataframe head, the numbered instructions and finally the query itself.
# The text inside the f-string is sent to the model verbatim (including the
# literal double quotes), so it is kept flush-left and unchanged.
messages = [
    dict(
        role="user",
        content=f"""
"You are working with a GeoDataFrame that is located in '/home/fkriskov/diplomski/datasets/geo_dataframe.shp'."
"Plot the Europe shapefile that is located in '/home/fkriskov/diplomski/datasets/Europe_coastline_shapefile/Europe_coastline.shp'."
"You cannot merge these shapefiles,just plot them."
"set marker='.' and figsize (10,10)"
"These are the columns of the dataframe:"
{columns}
"These are the columns of the Europe Shapefile:"
{europe_columns}
"'NUTS_0' is Alpha-2 code."
"The possible NUTS_0 codes are: {codes}"
"And this is the head of the dataframe:"
"{df_str}\n"
"Follow these instructions:"
"{instruction_str}"
"answer this query:"
"{user_query}\n"
""",
    ),
    # Earlier hand-written queries, kept for reference (all disabled):
    # desc_1
    # {"role": "user", "content": "What is the average petal length of top 30 longest sepals?"}, # 5.623333333333333
    # infer_1
    # {"role": "user", "content": "Is there a significant relationship between species and sepal length? Use chi square from scipy."},
    # desc_2
    # {"role": "user", "content": "Plot the average petal length for each iris species."},
    # desc_3
    # {"role": "user", "content": "Calculate the average pH for south EU."},
    # desc_4
    # {"role": "user", "content": "Calculate the average pH for Austria, from the mentioned csv."}, #5.302227171492205
    # desc_5
    # {"role": "user", "content": "Calculate the max value of 'N' for Slovenia, from the mentioned csv."},
    # infer_2
    # {"role": "user", "content": "Is there a significant difference between 'N' in Austria and France? Use ANOVA from scipy."},
    # infer_3
    # {"role": "user", "content": "Which parameter has the strongest correlation with EC among {pH_CaCl2, pH_H2O, OC, CaCO3, P, N, K}?"},
    # no name
    # {"role": "user", "content": "plot linear regressions with EC and pH_CaCl2, pH_H2O, OC, CaCO3, P, N and K, respectively. Use sklearn."},
    # geo_1
    # {"role": "user", "content": "plot all the points that have pH_CaCl2 > 6. use geopandas. save the image as a png."},
    # geo_2
    # {"role": "user", "content": "plot all the points with LC0_Desc=Woodland in Europe. Save the result as a png. Use geopandas."},
    # scatter
    # {"role": "user", "content": "plot scatter of EC and pH_CaCl2 and save it as scatter.png."},
    # geo_3
    # {"role": "user", "content": "plot all the points with LC0_Desc=Woodland & pH<6 in Europe. Save the result as a png. Use geopandas."},
]
# Opening fence that marks a Python code block in the model output.
FENCE_OPEN = "```python"

# Upper bound on sampling retries per model. Without a bound the script spins
# forever when a model never produces exactly one fenced code block.
MAX_GENERATION_ATTEMPTS = 10


def extract_generated_code(full_text, prompt_fence_count):
    """Return the model-written ```python block from *full_text*, or None.

    Args:
        full_text: Text returned by the pipeline. It is expected to contain
            the prompt followed by the completion.
        prompt_fence_count: How many times the opening fence already occurs
            in the prompt (the instructions mention it).

    Returns:
        The text between the last opening fence and the next closing fence
        when the completion added exactly one opening fence; otherwise None.
        If the closing fence is missing (e.g. the generation was cut off),
        everything after the opening fence is returned.
    """
    expected_fences = prompt_fence_count + 1
    if full_text.count(FENCE_OPEN) != expected_fences:
        return None
    # The last split segment is what follows the model's own opening fence.
    return full_text.split(FENCE_OPEN)[expected_fences].split("```")[0]


# Guarded so the helper above can be imported without loading any model;
# running the file as a script behaves as before.
if __name__ == "__main__":
    # The instructions mention the fence themselves, and the generated text is
    # expected to include the prompt, so those occurrences must be discounted.
    prompt_fence_count = instruction_str.count(FENCE_OPEN)

    for kode, model_name in models.items():
        # input(f"next model?, {kode}")
        # The access token is read from the environment instead of being
        # hard-coded in the source; None falls back to the locally stored login.
        pipe = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            token=os.environ.get("HF_TOKEN"),
        )

        # Work on a copy so retry-specific tweaks can never alter `messages`.
        temp_messages = list(messages)
        # The prompt does not change between attempts, so build it once.
        prompt = pipe.tokenizer.apply_chat_template(temp_messages, tokenize=False, add_generation_prompt=True)

        python_extract = None
        for _attempt in range(MAX_GENERATION_ATTEMPTS):
            outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.4, top_k=50, top_p=0.95)
            result = outputs[0]["generated_text"]
            print(result)
            python_extract = extract_generated_code(result, prompt_fence_count)
            if python_extract is not None:
                break
            # Disabled retry hint:
            # temp_messages = messages + ["Display only the complete Python solution.\n"]

        if python_extract is None:
            print(f"no single python code block from {kode} after {MAX_GENERATION_ATTEMPTS} attempts, skipping")
        else:
            print("\n------------------GREAT SUCCESS!!!------------------\n")
            print(python_extract)
            print("\n------------------REZULTAT!!!------------------\n")
            # All mistral variants share one result-file suffix.
            kodename = kode
            if kode.startswith("mistral"):
                kodename = "mistral"
            # with open(os.path.join("/home/fkriskov/diplomski/testing/uspjesni-rezultati", kode, filename+kodename+".txt"), 'w') as f:
            #     print("WRITING!", filename+kode+".txt")
            #     f.write(python_extract)

            # SECURITY: this executes model-generated code with the full
            # privileges of this process and in this module's namespace.
            # Only run it in a sandboxed / throw-away environment.
            error_occured = False
            try:
                exec(python_extract)
            except Exception as e:
                # Kept so the (disabled) error fixer below can consume them.
                error_occured = True
                error = e
                print("error occured: ", e)

            # double check
            # checked_code = double_check(pipe, python_extract, messages)
            # # print(checked_code)
            # try:
            #     error_occured = False
            #     print("\nRunning Code...\n")
            #     exec(checked_code)
            # except Exception as e:
            #     error_occured = True
            #     error = e
            #     print("\nerror occured: ", e, "\n")

            # error code fixer
            # python_fix_extract = checked_code
            # while error_occured:
            #     python_fix_extract = error_code_fixer(pipe, python_fix_extract, error)
            #     print("\n------------------FIXED!!!------------------\n")
            #     print(python_fix_extract)
            #     print("\n------------------REZULTAT!!!------------------\n")
            #     try:
            #         error_occured = False
            #         exec(python_fix_extract)
            #     except Exception as e:
            #         error_occured = True
            #         error = e
            #         print("\nerror occured: ", e, "\n")

        # Drop this model before the next one is loaded so two large
        # checkpoints are never resident at the same time.
        del pipe
        if torch.cuda.is_available():
            torch.cuda.empty_cache()