Skip to content

Commit

Permalink
improved reconstructing class word free prompt
Browse files Browse the repository at this point in the history
  • Loading branch information
bssrdf committed Aug 30, 2024
1 parent cbf3108 commit 19d5939
Showing 1 changed file with 15 additions and 2 deletions.
17 changes: 15 additions & 2 deletions clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,14 @@ class CLIPTokenizer {
}
}

/// Tidies decoded text by removing the space that precedes a comma.
///
/// Each non-overlapping occurrence of " ," found while scanning the input
/// from left to right is replaced with ",". The output is not re-scanned,
/// so an input of "  ," (two spaces) yields " ,".
///
/// @param text  Text to clean up; it is read only and never modified.
/// @return      A copy of `text` with every " ," collapsed to ",".
std::string clean_up_tokenization(const std::string& text) {
    // The pattern is a fixed two-character literal, so a substring search is
    // used in place of constructing a std::regex on every call.
    const std::string needle = " ,";

    std::string result;
    result.reserve(text.size());

    std::string::size_type pos = 0;
    for (;;) {
        const std::string::size_type hit = text.find(needle, pos);
        if (hit == std::string::npos) {
            break;
        }
        // Copy the untouched stretch before the match, then emit the comma
        // without its leading space.
        result.append(text, pos, hit - pos);
        result.push_back(',');
        pos = hit + needle.size();
    }
    // Copy whatever follows the last match (or the whole input if none).
    result.append(text, pos, std::string::npos);
    return result;
}

std::string decode(const std::vector<int>& tokens) {
std::string text = "";
for (int t : tokens) {
Expand All @@ -351,8 +359,12 @@ class CLIPTokenizer {
std::u32string ts = decoder[t];
// printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
std::string s = utf32_to_utf8(ts);
if (s.length() >= 4 && ends_with(s, "</w>")) {
text += " " + s.replace(s.length() - 4, s.length() - 1, "");
if (s.length() >= 4 ){
if(ends_with(s, "</w>")) {
text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
}else{
text += s;
}
} else {
text += " " + s;
}
Expand All @@ -364,6 +376,7 @@ class CLIPTokenizer {

// std::string s((char *)bytes.data());
// std::string s = "";
text = clean_up_tokenization(text);
return trim(text);
}

Expand Down

0 comments on commit 19d5939

Please sign in to comment.