diff --git a/readme.md b/readme.md index 822673e2..a4e8e599 100644 --- a/readme.md +++ b/readme.md @@ -1,2 +1,6 @@ Kirikiroid2 - A cross-platform port of Kirikiri2/KirikiriZ -========================================================== \ No newline at end of file +========================================================== + +Depend on most code from Kirikiri2 and KirikiriZ(https://github.com/krkrz/krkrz) +Video playback module modified from kodi(https://github.com/xbmc/xbmc) +Some string code from glibc and Apple Libc. \ No newline at end of file diff --git a/src/core/base/TextStream.cpp b/src/core/base/TextStream.cpp index 8d629263..8d48272a 100644 --- a/src/core/base/TextStream.cpp +++ b/src/core/base/TextStream.cpp @@ -231,7 +231,7 @@ class tTVPTextReadStream : public iTJSTextReadStream // check UTF-8 BOM if(mark[0] == 0xef && mark[1] == 0xbb && mark[2] == 0xbf) { // UTF-8 BOM - tjs_uint size = (tjs_uint)(Stream->GetSize()-3); + tjs_uint size = (tjs_uint)(Stream->GetSize() - 3) - ofs; tjs_uint8 *nbuf = new tjs_uint8[size + 1]; try { @@ -254,7 +254,7 @@ class tTVPTextReadStream : public iTJSTextReadStream // ansi/mbcs // read whole and hold it Stream->SetPosition(ofs); - tjs_uint size = (tjs_uint)(Stream->GetSize()); + tjs_uint size = (tjs_uint)(Stream->GetSize()) - ofs; tjs_uint8 *nbuf = new tjs_uint8[size + 1]; try { diff --git a/src/core/base/win32/PluginImpl.h b/src/core/base/win32/PluginImpl.h index 638fb10d..6be157de 100644 --- a/src/core/base/win32/PluginImpl.h +++ b/src/core/base/win32/PluginImpl.h @@ -84,7 +84,7 @@ TJS_EXP_FUNC_DEF(void, TVPThrowPluginUnboundFunctionError, (const char *funcname TJS_EXP_FUNC_DEF(void, TVPThrowPluginUnboundFunctionError, (const tjs_char *funcname)); #endif inline TJS_EXP_FUNC_DEF(void *, TVP_malloc, (size_t size)) { return malloc(size); } -inline TJS_EXP_FUNC_DEF(void *, TVP_realloc, (void *pp, size_t size)) { return realloc(pp, size); } +// inline TJS_EXP_FUNC_DEF(void *, TVP_realloc, (void *pp, size_t size)) { return 
realloc(pp, size); } inline TJS_EXP_FUNC_DEF(void, TVP_free, (void *pp)) { return free(pp); } TJS_EXP_FUNC_DEF(tjs_int, TVPGetAutoLoadPluginCount, ()); //--------------------------------------------------------------------------- diff --git a/src/core/base/win32/StorageImpl.cpp b/src/core/base/win32/StorageImpl.cpp index 8bf55b4c..507dc498 100644 --- a/src/core/base/win32/StorageImpl.cpp +++ b/src/core/base/win32/StorageImpl.cpp @@ -44,6 +44,29 @@ #define lseek64 lseek #endif +#ifdef WIN32 +typedef struct _stat64 tTVP_stat; +#else +typedef struct ::stat64 tTVP_stat; +#endif + +static bool TVP_stat(const tjs_char *name, tTVP_stat &s) { +#ifdef WIN32 + return !_wstat64(name, &s); +#else + tTJSNarrowStringHolder holder(name); + return !::stat64(holder, &s); +#endif +} +static bool TVP_stat(const char *name, tTVP_stat &s) { +#ifdef WIN32 + ttstr filename(name); + return !_wstat64(filename.c_str(), &s); +#else + return !::stat64(name, &s); +#endif +} + //--------------------------------------------------------------------------- // tTVPFileMedia //--------------------------------------------------------------------------- @@ -126,51 +149,53 @@ tTJSBinaryStream * TJS_INTF_METHOD tTVPFileMedia::Open(const ttstr & name, tjs_u return new tTVPLocalFileStream(origname, _name, flags); } + void TVPListDir(const std::string &folder, std::function cb) { DIR *dirp; struct dirent *direntp; - struct stat stat_buf; + tTVP_stat stat_buf; if ((dirp = opendir(folder.c_str()))) { while ((direntp = readdir(dirp)) != NULL) { std::string fullpath = folder + "/" + direntp->d_name; - if (stat(fullpath.c_str(), &stat_buf) == -1) { + if (!TVP_stat(fullpath.c_str(), stat_buf)) continue; - } cb(direntp->d_name, stat_buf.st_mode); } closedir(dirp); } } -void TVPGetLocalFileListAt(const ttstr &name, iTVPStorageLister *lister, int stat_mask) { +void TVPGetLocalFileListAt(const ttstr &name, const std::function& cb) { DIR *dirp; struct dirent *direntp; - struct stat stat_buf; + tTVP_stat stat_buf; 
std::string folder(name.AsNarrowStdString()); if ((dirp = opendir(folder.c_str()))) { while ((direntp = readdir(dirp)) != NULL) { std::string fullpath = folder + "/" + direntp->d_name; - if (stat(fullpath.c_str(), &stat_buf) == -1) - { + if (!TVP_stat(fullpath.c_str(), stat_buf)) continue; - } - if (stat_buf.st_mode & stat_mask) + ttstr file(direntp->d_name); + tjs_char *p = file.Independ(); + while (*p) { - ttstr file(direntp->d_name); - tjs_char *p = file.Independ(); - while (*p) - { - // make all characters small - if (*p >= TJS_W('A') && *p <= TJS_W('Z')) - *p += TJS_W('a') - TJS_W('A'); - p++; - } - lister->Add(file); + // make all characters small + if (*p >= TJS_W('A') && *p <= TJS_W('Z')) + *p += TJS_W('a') - TJS_W('A'); + p++; } + tTVPLocalFileInfo info; + info.NativeName = direntp->d_name; + info.Mode = stat_buf.st_mode; + info.Size = stat_buf.st_size; + info.AccessTime = stat_buf.st_atime; + info.ModifyTime = stat_buf.st_mtime; + info.CreationTime = stat_buf.st_ctime; + cb(file, &info); } closedir(dirp); } @@ -210,7 +235,11 @@ void TJS_INTF_METHOD tTVPFileMedia::GetListAt(const ttstr &_name, iTVPStorageLis FindClose(handle); } #endif - TVPGetLocalFileListAt(name, lister, S_IFREG); + TVPGetLocalFileListAt(name, [lister](const ttstr &name, tTVPLocalFileInfo* s) { + if (s->Mode & (S_IFREG)) { + lister->Add(name); + } + }); } static int _utf8_strcasecmp(const char *a, const char *b) { @@ -546,10 +575,8 @@ bool TVPCheckExistentLocalFile(const ttstr &name) else return true; // a file #endif - struct stat s; - tTJSNarrowStringHolder holder(name.c_str()); - if(stat(holder, &s)) - { + tTVP_stat s; + if(!TVP_stat(name.c_str(), s)) { return false; // not exist } return s.st_mode & S_IFREG; @@ -564,7 +591,6 @@ bool TVPCheckExistentLocalFile(const ttstr &name) //--------------------------------------------------------------------------- bool TVPCheckExistentLocalFolder(const ttstr &name) { -#ifdef WIN32 #if 0 DWORD attrib = GetFileAttributes(name.c_str()); if(attrib 
!= 0xffffffff && (attrib & FILE_ATTRIBUTE_DIRECTORY)) @@ -572,18 +598,8 @@ bool TVPCheckExistentLocalFolder(const ttstr &name) else return false; // not a folder #endif - struct _stat s = {0}; - if(_wstat(name.c_str(), &s)) -#else // posix utf-8 - struct stat s = {0}; - tTJSNarrowStringHolder holder(name.c_str()); - char* p = (char*)holder.operator const tjs_nchar *(); - char* t = p; while(*t) ++t; - while(t > p && (t[-1] == '\\' || t[-1] == '/')) --t; - *t = 0; - if(stat(p, &s)) -#endif - { + tTVP_stat s; + if (!TVP_stat(name.c_str(), s)) { return false; // not exist } diff --git a/src/core/base/win32/StorageImpl.h b/src/core/base/win32/StorageImpl.h index fb688d04..cb849c4f 100644 --- a/src/core/base/win32/StorageImpl.h +++ b/src/core/base/win32/StorageImpl.h @@ -26,8 +26,16 @@ void TVPUnloadArchiveSPI(HINSTANCE inst); //--------------------------------------------------------------------------- #endif +struct tTVPLocalFileInfo { + const char * NativeName; + unsigned short Mode; // S_IFMT + tjs_uint64 Size; + time_t AccessTime; + time_t ModifyTime; + time_t CreationTime; +}; -void TVPGetLocalFileListAt(const ttstr &name, iTVPStorageLister *lister, int stat_mask); +void TVPGetLocalFileListAt(const ttstr &name, const std::function& cb); //--------------------------------------------------------------------------- // tTVPLocalFileStream diff --git a/src/core/base/win32/SysInitImpl.cpp b/src/core/base/win32/SysInitImpl.cpp index d415a3ca..90227bcb 100644 --- a/src/core/base/win32/SysInitImpl.cpp +++ b/src/core/base/win32/SysInitImpl.cpp @@ -31,7 +31,7 @@ #include "XP3Archive.h" #include "ScriptMgnIntf.h" #include "XP3Archive.h" -#include "VersionFormUnit.h" +//#include "VersionFormUnit.h" #include "EmergencyExit.h" //#include "tvpgl_ia32_intf.h" @@ -41,7 +41,7 @@ #include "Exception.h" #include "ApplicationSpecialPath.h" //#include "resource.h" -#include "ConfigFormUnit.h" +//#include "ConfigFormUnit.h" #include "TickCount.h" #ifdef IID #undef IID diff --git 
a/src/core/base/win32/SystemImpl.cpp b/src/core/base/win32/SystemImpl.cpp index 2fa5553c..2410adb4 100644 --- a/src/core/base/win32/SystemImpl.cpp +++ b/src/core/base/win32/SystemImpl.cpp @@ -30,7 +30,7 @@ #include "TVPScreen.h" //#include "CompatibleNativeFuncs.h" #include "DebugIntf.h" -#include "VersionFormUnit.h" +//#include "VersionFormUnit.h" #include "vkdefine.h" #include "ScriptMgnIntf.h" #include "tjsArray.h" diff --git a/src/core/base/win32/win32io.h b/src/core/base/win32/win32io.h index 47dd91e6..1544a73c 100644 --- a/src/core/base/win32/win32io.h +++ b/src/core/base/win32/win32io.h @@ -12,9 +12,6 @@ extern "C" { extern void* valloc(int n); extern void vfree(void *p); extern void logStack(std::string &out); - struct _stat_win32 : public stat {}; - int _stat_win32(const char * _Filename, struct stat * _Stat); -#define stat _stat_win32 FILE * fopen(const char * _Filename, const char * _Mode); } #endif \ No newline at end of file diff --git a/src/core/environ/Application.cpp b/src/core/environ/Application.cpp index 6e12485a..77996f42 100644 --- a/src/core/environ/Application.cpp +++ b/src/core/environ/Application.cpp @@ -21,7 +21,7 @@ #include "Exception.h" //#include "Resource.h" #include "SystemControl.h" -#include "MouseCursor.h" +//#include "MouseCursor.h" #include "SystemImpl.h" #include "WaveImpl.h" #include "GraphicsLoadThread.h" @@ -1026,7 +1026,6 @@ void tTVPApplication::OnActivate() } void tTVPApplication::OnDeactivate( ) { - if (!image_load_thread_) return; // project is not startup yet application_activating_ = false; if (!_project_startup) return; @@ -1056,6 +1055,12 @@ void tTVPApplication::OnExit() CloseConsole(); } +void tTVPApplication::OnLowMemory() +{ + if (!_project_startup) return; + TVPDeliverCompactEvent(TVP_COMPACT_LEVEL_MAX); +} + bool tTVPApplication::GetNotMinimizing() const { return !application_activating_; diff --git a/src/core/environ/Application.h b/src/core/environ/Application.h index 84481128..bc94f20e 100644 --- 
a/src/core/environ/Application.h +++ b/src/core/environ/Application.h @@ -175,6 +175,7 @@ class tTVPApplication { void OnActivate( ); void OnDeactivate( ); void OnExit(); + void OnLowMemory(); bool GetActivating() const { return application_activating_; } bool GetNotMinimizing() const; diff --git a/src/core/environ/ConfigManager/GlobalConfigManager.cpp b/src/core/environ/ConfigManager/GlobalConfigManager.cpp index 5ec01f49..f1b46222 100644 --- a/src/core/environ/ConfigManager/GlobalConfigManager.cpp +++ b/src/core/environ/ConfigManager/GlobalConfigManager.cpp @@ -2,6 +2,30 @@ #include "tinyxml2/tinyxml2.h" #include "platform/CCFileUtils.h" #include "Platform.h" +#include "UtilStreams.h" +#include "LocaleConfigManager.h" + +bool TVPWriteDataToFile(const ttstr &filepath, const void *data, unsigned int len); +class XMLMemPrinter : public tinyxml2::XMLPrinter { + tTVPMemoryStream _stream; + char _buffer[4096]; +public: + virtual void Print(const char* format, ...) override { + va_list param; + va_start(param, format); + int n = vsnprintf(_buffer, 4096, format, param); + va_end(param); + _stream.Write(_buffer, n); + } + void SaveFile(const std::string &path) { + if (!TVPWriteDataToFile(path, _stream.GetInternalBuffer(), _stream.GetSize())) { + TVPShowSimpleMessageBox( + LocaleConfigManager::GetInstance()->GetText("cannot_create_preference"), + LocaleConfigManager::GetInstance()->GetText("readonly_storage")); + } + } +}; + GlobalConfigManager::GlobalConfigManager() { Initialize(); @@ -68,18 +92,18 @@ void iSysConfigManager::SaveToFile() { } doc.LinkEndChild(rootElement); - FILE *fp = nullptr; -#ifdef _MSC_VER - fp = _wfopen(ttstr(GetFilePath()).c_str(), TJS_W("w")); -#else - fp = fopen(GetFilePath().c_str(), "w"); -#endif - doc.SaveFile(fp); - fclose(fp); - + XMLMemPrinter stream; + doc.Print(&stream); + stream.SaveFile(GetFilePath()); ConfigUpdated = false; } +bool iSysConfigManager::IsValueExist(const std::string &name) +{ + auto it = AllConfig.find(name); + return it 
!= AllConfig.end(); +} + std::string GlobalConfigManager::GetFilePath() { return TVPGetInternalPreferencePath() + "GlobalPreference.xml"; } diff --git a/src/core/environ/ConfigManager/GlobalConfigManager.h b/src/core/environ/ConfigManager/GlobalConfigManager.h index a7fbc33d..f1e85f21 100644 --- a/src/core/environ/ConfigManager/GlobalConfigManager.h +++ b/src/core/environ/ConfigManager/GlobalConfigManager.h @@ -17,6 +17,8 @@ class iSysConfigManager { public: void SaveToFile(); + bool IsValueExist(const std::string &name); + template T GetValue(const std::string &name, const T& defVal); diff --git a/src/core/environ/ConfigManager/IndividualConfigManager.cpp b/src/core/environ/ConfigManager/IndividualConfigManager.cpp index b45c2092..e67bef86 100644 --- a/src/core/environ/ConfigManager/IndividualConfigManager.cpp +++ b/src/core/environ/ConfigManager/IndividualConfigManager.cpp @@ -29,19 +29,19 @@ bool IndividualConfigManager::CheckExistAt(const std::string &folder) { bool IndividualConfigManager::CreatePreferenceAt(const std::string &folder) { std::string fullpath = folder + "/" FILENAME; - FILE *fp = -#ifdef _MSC_VER - _wfopen(ttstr(fullpath).c_str(), TJS_W("w")); -#else - fopen(fullpath.c_str(), "w"); -#endif +// FILE *fp = +// #ifdef _MSC_VER +// _wfopen(ttstr(fullpath).c_str(), TJS_W("w")); +// #else +// fopen(fullpath.c_str(), "w"); +// #endif Clear(); - if (!fp) { - TVPShowSimpleMessageBox( - LocaleConfigManager::GetInstance()->GetText("cannot_create_preference"), - LocaleConfigManager::GetInstance()->GetText("readonly_storage")); - return false; - } +// if (!fp) { +// TVPShowSimpleMessageBox( +// LocaleConfigManager::GetInstance()->GetText("cannot_create_preference"), +// LocaleConfigManager::GetInstance()->GetText("readonly_storage")); +// return false; +// } CurrentPath = fullpath; return true; } diff --git a/src/core/environ/android/AndroidUtils.cpp b/src/core/environ/android/AndroidUtils.cpp index 6db43ea7..049e66cd 100644 --- 
a/src/core/environ/android/AndroidUtils.cpp +++ b/src/core/environ/android/AndroidUtils.cpp @@ -596,6 +596,7 @@ void TVPForceSwapBuffer() { } bool TVPCheckStartupPath(const std::string &path) { + // check writing permission first int pos = path.find_last_of('/'); if (pos == path.npos) return false; std::string parent = path.substr(0, pos); @@ -605,20 +606,17 @@ bool TVPCheckStartupPath(const std::string &path) { jstring jstrPath = methodInfo.env->NewStringUTF(parent.c_str()); success = methodInfo.env->CallStaticBooleanMethod(methodInfo.classID, methodInfo.methodID, jstrPath); methodInfo.env->DeleteLocalRef(jstrPath); + if (success) { + parent += "/savedata"; + if (!TVPCheckExistentLocalFolder(parent)) { + TVPCreateFolders(parent); + } + jstrPath = methodInfo.env->NewStringUTF(parent.c_str()); + success = methodInfo.env->CallStaticBooleanMethod(methodInfo.classID, methodInfo.methodID, jstrPath); + methodInfo.env->DeleteLocalRef(jstrPath); + } } - //pos = parent.find_last_of('/'); - //if (pos == parent.npos) return false; - //std::string parentName = parent.substr(pos + 1); -// FILE* fp; -// std::string testfile = parent + "/.___test_for_kr2_write"; -// fp = fopen(testfile.c_str(), "wb"); -// bool success = false; -// if (fp) { -// fclose(fp); -// success = !remove(testfile.c_str()); -// } - if (!success) { std::vector paths; paths.emplace_back(GetInternalStoragePath()); @@ -630,9 +628,33 @@ bool TVPCheckStartupPath(const std::string &path) { msg = msg.replace(msg.begin() + pos, msg.begin() + pos + 2, paths.back()); } } - int result = TVPShowSimpleMessageBoxYesNo(msg, LocaleConfigManager::GetInstance()->GetText("readonly_storage")); - return result == 0; + std::vector btns; + btns.push_back(LocaleConfigManager::GetInstance()->GetText("continue_run")); + JNIEnv *pEnv = JniHelper::getEnv(); + jclass classID = pEnv->FindClass("android/os/Build$VERSION"); + jfieldID idSDK_INT = methodInfo.env->GetStaticFieldID(classID, "SDK_INT", "I"); + jint sdkid = 
pEnv->GetStaticIntField(classID, idSDK_INT); + bool isLOLLIPOP = sdkid >= 21; + if (isLOLLIPOP) + btns.push_back(LocaleConfigManager::GetInstance()->GetText("get_sdcard_permission")); + else + btns.push_back(LocaleConfigManager::GetInstance()->GetText("cancel")); + int result = TVPShowSimpleMessageBox(msg, LocaleConfigManager::GetInstance()->GetText("readonly_storage"), btns); + if (isLOLLIPOP && result == 1) { + if (JniHelper::getStaticMethodInfo(methodInfo, "org/tvp/kirikiri2/KR2Activity", "requireLEXA", "(Ljava/lang/String;)V")) { + jstring jstrPath = methodInfo.env->NewStringUTF(paths.back().c_str()); + methodInfo.env->CallStaticVoidMethod(methodInfo.classID, methodInfo.methodID, jstrPath); + methodInfo.env->DeleteLocalRef(jstrPath); + } + } + if (result != 0) + return false; } + + // check adreno GPU issue +// if (IndividualConfigManager::GetInstance()->GetValue("renderer", "software") == "opengl") { +// TVPOnOpenGLRendererSelected(false); +// } return true; } diff --git a/src/core/environ/cocos2d/MainScene.cpp b/src/core/environ/cocos2d/MainScene.cpp index d592689b..b3a38d57 100644 --- a/src/core/environ/cocos2d/MainScene.cpp +++ b/src/core/environ/cocos2d/MainScene.cpp @@ -115,7 +115,7 @@ int TVPDrawSceneOnce(int interval) { static tjs_uint64 lastTick = TVPGetRoughTickCount32(); tjs_uint64 curTick = TVPGetRoughTickCount32(); int remain = interval - (curTick - lastTick); - if (remain < 0) { + if (remain <= 0) { if (_postUpdate) _postUpdate(); Director* director = Director::getInstance(); director->drawScene(/*true*/); @@ -350,7 +350,7 @@ class TVPWindowLayer : public cocos2d::extension::ScrollView, public iWindowLaye TVPWindowLayer *_prevWindow, *_nextWindow; friend class TVPWindowManagerOverlay; friend class TVPMainScene; - int _LastMouseX, _LastMouseY; + int _LastMouseX = 0, _LastMouseY = 0; std::string _caption; // std::map _AllOverlay; float _drawSpriteScaleX = 1.0f, _drawSpriteScaleY = 1.0f; @@ -861,9 +861,8 @@ class TVPWindowLayer : public 
cocos2d::extension::ScrollView, public iWindowLaye RecalcPaintBox(); } - virtual void UpdateDrawBuffer(const iTVPBaseBitmap *buf) { - if (!buf) return; - iTVPTexture2D *tex = buf->GetTexture(); + virtual void UpdateDrawBuffer(iTVPTexture2D *tex) { + if (!tex) return; // iTVPRenderManager *mgr = TVPGetRenderManager(); // if (!mgr->IsSoftware()) { // static iTVPRenderMethod *method = TVPGetRenderManager()->GetRenderMethod("CopyOpaqueImage"); @@ -1268,6 +1267,7 @@ class TVPWindowLayer : public cocos2d::extension::ScrollView, public iWindowLaye } else { CanCloseWork = true; TVPPostEvent(obj, obj, eventname, 0, TVP_EPT_IMMEDIATE, 1, arg); + TVPDrawSceneOnce(0); // for post event // this event happens immediately // and does not return until done return CanCloseWork; // CanCloseWork is set by the event handler @@ -1893,10 +1893,11 @@ void TVPMainScene::onKeyPressed(EventKeyboard::KeyCode keyCode, Event* event) { case EventKeyboard::KeyCode::KEY_F12: if (TVPGetCurrentShiftKeyState() & ssShift) { std::vector btns({ "OK", "Cancel" }); - ttstr text; + ttstr text; tTJSVariant result; if (TVPShowSimpleInputBox(text, "console command", "", btns) == 0) { - TVPExecuteExpression(text); + TVPExecuteExpression(text, &result); } + result = text; } break; #endif diff --git a/src/core/environ/typedefine.h b/src/core/environ/typedefine.h index 0935f9da..481ad34b 100644 --- a/src/core/environ/typedefine.h +++ b/src/core/environ/typedefine.h @@ -160,6 +160,7 @@ typedef struct { } RECT, RECTL; typedef intptr_t LONG_PTR; typedef LONG HRESULT; +#define TYPE_GUID_DEFINED typedef struct { DWORD Data1; WORD Data2; diff --git a/src/core/environ/ui/FileSelectorForm.cpp b/src/core/environ/ui/FileSelectorForm.cpp index a51307df..7292b504 100644 --- a/src/core/environ/ui/FileSelectorForm.cpp +++ b/src/core/environ/ui/FileSelectorForm.cpp @@ -119,7 +119,7 @@ void TVPBaseFileSelectorForm::ListDir(std::string path) { if (_title) { #if CC_PLATFORM_WIN32 == CC_TARGET_PLATFORM // for better screenshot - 
_title->setTitleFontName("SIMHEI"); + _title->setTitleFontName("SIMHEI.ttf"); if (!split_path.second.empty() && (split_path.second.back() == '/' || split_path.second.back() == '\\')) { split_path.second.pop_back(); } @@ -513,7 +513,7 @@ void TVPBaseFileSelectorForm::FileItemCellImpl::initFromFile(const char * filena CellTextAreaSize.height = 0; OrigCellTextSize = FileNameNode->getContentSize(); #if CC_PLATFORM_WIN32 == CC_TARGET_PLATFORM - FileNameNode->setFontName("SIMHEI"); + FileNameNode->setFontName("SIMHEI.ttf"); #endif } static const std::string str_highlight("highlight"); diff --git a/src/core/environ/ui/GlobalPreferenceForm.cpp b/src/core/environ/ui/GlobalPreferenceForm.cpp index 25f745c4..723da9a1 100644 --- a/src/core/environ/ui/GlobalPreferenceForm.cpp +++ b/src/core/environ/ui/GlobalPreferenceForm.cpp @@ -44,7 +44,16 @@ void TVPGlobalPreferenceForm::Initialize() static bool Inited = false; if (!Inited) { Inited = true; + if (!GlobalConfigManager::GetInstance()->IsValueExist("GL_EXT_shader_framebuffer_fetch")) { + // disable GL_EXT_shader_framebuffer_fetch normally for adreno GPU + if (strstr((const char*)glGetString(GL_RENDERER), "Adreno")) { + GlobalConfigManager::GetInstance()->SetValueInt("GL_EXT_shader_framebuffer_fetch", 0); + } + } + initAllConfig(); WalkConfig(&RootPreference); + WalkConfig(&SoftRendererOptPreference); + WalkConfig(&OpenglOptPreference); } } diff --git a/src/core/environ/ui/MainFileSelectorForm.cpp b/src/core/environ/ui/MainFileSelectorForm.cpp index 4a3996a9..00c862df 100644 --- a/src/core/environ/ui/MainFileSelectorForm.cpp +++ b/src/core/environ/ui/MainFileSelectorForm.cpp @@ -254,12 +254,6 @@ void TVPMainFileSelectorForm::doStartup(const std::string &path) { if (TVPMainScene::GetInstance()->startupFrom(path)) { if (GlobalConfigManager::GetInstance()->GetValue("remember_last_path", true)) { _AddHistory(path); -// std::string lastpath_file = _getLastPathFilePath(); -// FILE* fp = fopen(lastpath_file.c_str(), "wt"); -// if (fp) 
{ -// fwrite(path.c_str(), 1, path.size(), fp); -// fclose(fp); -// } } } } @@ -333,43 +327,54 @@ void TVPMainFileSelectorForm::showMenu(Ref*) { reader.findWidget("btnHelp")->addClickEventListener([this](Ref*) { TVPTipsHelpForm::show(); }); - reader.findWidget("btnAbout")->addClickEventListener([](Ref*) { - std::string versionText = "Version "; - versionText += TVPGetPackageVersionString(); - versionText += "\n"; - versionText += LocaleConfigManager::GetInstance()->GetText("about_content"); - - const char * pszBtnText[] = { - LocaleConfigManager::GetInstance()->GetText("ok").c_str(), - LocaleConfigManager::GetInstance()->GetText("device_info").c_str(), - }; - - std::string strCaption = LocaleConfigManager::GetInstance()->GetText("menu_about"); - int n = TVPShowSimpleMessageBox(versionText.c_str(), strCaption.c_str(), - sizeof(pszBtnText) / sizeof(pszBtnText[0]), pszBtnText); - - switch (n) { - case 1: { - std::string text = TVPGetOpenGLInfo(); - const char *pOK = LocaleConfigManager::GetInstance()->GetText("ok").c_str(); - TVPShowSimpleMessageBox(text.c_str(), + bool showSimpleAbout = false; + if(showSimpleAbout) { + reader.findWidget("btnAbout")->addClickEventListener([](Ref*) { + std::string versionText = "Version "; + versionText += TVPGetPackageVersionString(); + + std::string btnText = LocaleConfigManager::GetInstance()->GetText("ok"); + const char *pszBtnText = btnText.c_str(); + std::string strCaption = LocaleConfigManager::GetInstance()->GetText("menu_about"); + const char *caption = strCaption.c_str(); + TVPShowSimpleMessageBox(versionText.c_str(), caption, 1, &pszBtnText); + }); + reader.findWidget("btnExit")->addClickEventListener([](Ref*) { + if (TVPShowSimpleMessageBoxYesNo( + LocaleConfigManager::GetInstance()->GetText("sure_to_exit"), + "XP3Player") == 0) TVPExitApplication(0); + }); + } else { + reader.findWidget("btnAbout")->addClickEventListener([](Ref*) { + std::string versionText = "Version "; + versionText += TVPGetPackageVersionString(); + 
versionText += "\n"; + versionText += LocaleConfigManager::GetInstance()->GetText("about_content"); + + const char * pszBtnText[] = { + LocaleConfigManager::GetInstance()->GetText("ok").c_str(), LocaleConfigManager::GetInstance()->GetText("device_info").c_str(), - 1, &pOK); - } break; - } - }); - reader.findWidget("btnExit")->addClickEventListener([](Ref*) { - _AskExit(); -// TVPMessageBoxForm::showYesNo("Kirikiroid2", -// LocaleConfigManager::GetInstance()->GetText("sure_to_exit"), [](int n) { -// if (n == 0) TVPExitApplication(0); -// }); -// TVPMessageBoxForm::showYesNo("Kirikiroid2", -// LocaleConfigManager::GetInstance()->GetText("sure_to_exit"), [](int n) { -// if (n == 0) TVPExitApplication(0); -// }); - }); -#endif + }; + + std::string strCaption = LocaleConfigManager::GetInstance()->GetText("menu_about"); + int n = TVPShowSimpleMessageBox(versionText.c_str(), strCaption.c_str(), + sizeof(pszBtnText) / sizeof(pszBtnText[0]), pszBtnText); + + switch (n) { + case 1: { + std::string text = TVPGetOpenGLInfo(); + const char *pOK = LocaleConfigManager::GetInstance()->GetText("ok").c_str(); + TVPShowSimpleMessageBox(text.c_str(), + LocaleConfigManager::GetInstance()->GetText("device_info").c_str(), + 1, &pOK); + } break; + } + }); + reader.findWidget("btnExit")->addClickEventListener([](Ref*) { + _AskExit(); + }); + } + } const Size &uiSize = getContentSize(); const Vec2 &pos = _menu->getPosition(); diff --git a/src/core/environ/ui/PreferenceConfig.h b/src/core/environ/ui/PreferenceConfig.h index aa99ddb3..401a6971 100644 --- a/src/core/environ/ui/PreferenceConfig.h +++ b/src/core/environ/ui/PreferenceConfig.h @@ -2,6 +2,15 @@ static tPreferenceScreen RootPreference; static tPreferenceScreen OpenglOptPreference, SoftRendererOptPreference; static Size PrefListSize; +class tTVPPreferenceInfoConstant : public iTVPPreferenceInfo { +public: + tTVPPreferenceInfoConstant(const std::string &cap) : iTVPPreferenceInfo(cap, "") {} + virtual iPreferenceItem *createItem() 
override { + LocaleConfigManager *locmgr = LocaleConfigManager::GetInstance(); + return CreatePreferenceItem(PrefListSize, locmgr->GetText(Caption)); + } +}; + class tTVPPreferenceInfoCheckBox : public tTVPPreferenceInfo { public: tTVPPreferenceInfoCheckBox(const std::string &cap, const std::string &key, bool defval) @@ -42,18 +51,18 @@ class tTVPPreferenceInfoSelectList : public tTVPPreferenceInfo, tPr std::vector > ListInfo; }; -class tTVPPreferenceInfoSelectRenderer : public tTVPPreferenceInfoSelectList { - typedef tTVPPreferenceInfoSelectList inherit; -public: - tTVPPreferenceInfoSelectRenderer(const std::string &cap, const std::string &key, const std::string &defval, - const std::initializer_list > &listinfo) : inherit(cap, key, defval, listinfo) {} - virtual void onSetValue(const std::string &v) { - inherit::onSetValue(v); - if (v == "opengl") { - TVPOnOpenGLRendererSelected(); - } - } -}; +// class tTVPPreferenceInfoSelectRenderer : public tTVPPreferenceInfoSelectList { +// typedef tTVPPreferenceInfoSelectList inherit; +// public: +// tTVPPreferenceInfoSelectRenderer(const std::string &cap, const std::string &key, const std::string &defval, +// const std::initializer_list > &listinfo) : inherit(cap, key, defval, listinfo) {} +// virtual void onSetValue(const std::string &v) { +// inherit::onSetValue(v); +// if (v == "opengl") { +// TVPOnOpenGLRendererSelected(true); +// } +// } +// }; class tTVPPreferenceInfoSelectFile : public tTVPPreferenceInfo { public: @@ -160,13 +169,17 @@ static void initAllConfig() { RootPreference.Preferences = { new tTVPPreferenceInfoCheckBox("preference_output_log", "outputlog", true), new tTVPPreferenceInfoCheckBox("preference_show_fps", "showfps", false), - new tTVPPreferenceInfoSelectRenderer("preference_select_renderer", "renderer", "software", { + new tTVPPreferenceInfoSelectList("preference_select_renderer", "renderer", "software", { { "preference_opengl", "opengl" }, { "preference_software", "software" } }), new 
tTVPPreferenceInfoRendererSubPref("preference_renderer_opt"), new tTVPPreferenceInfoSelectFile("preference_default_font", "default_font", ""), +#ifdef CC_TARGET_OS_IPHONE + new tTVPPreferenceInfoSelectList("preference_mem_limit", "memusage", "high", { +#else new tTVPPreferenceInfoSelectList("preference_mem_limit", "memusage", "unlimited", { +#endif { "preference_mem_unlimited", "unlimited" }, { "preference_mem_high", "high" }, { "preference_mem_medium", "medium" }, @@ -211,9 +224,14 @@ static void initAllConfig() { OpenglOptPreference.Title = "preference_opengl_renderer_opt"; OpenglOptPreference.Preferences = { new tTVPPreferenceInfoSubPref("preference_opengl_extension_opt", { + new tTVPPreferenceInfoConstant("preference_opengl_extension_desc"), +#ifdef CC_TARGET_OS_IPHONE new tTVPPreferenceInfoCheckBox("GL_EXT_shader_framebuffer_fetch", "GL_EXT_shader_framebuffer_fetch", true), +#else + new tTVPPreferenceInfoCheckBox("GL_EXT_shader_framebuffer_fetch", "GL_EXT_shader_framebuffer_fetch", false), +#endif new tTVPPreferenceInfoCheckBox("GL_ARM_shader_framebuffer_fetch", "GL_ARM_shader_framebuffer_fetch", true), - new tTVPPreferenceInfoCheckBox("GL_ARM_shader_framebuffer_fetch", "GL_NV_shader_framebuffer_fetch", true), + new tTVPPreferenceInfoCheckBox("GL_NV_shader_framebuffer_fetch", "GL_NV_shader_framebuffer_fetch", true), new tTVPPreferenceInfoCheckBox("GL_EXT_copy_image", "GL_EXT_copy_image", false), new tTVPPreferenceInfoCheckBox("GL_OES_copy_image", "GL_OES_copy_image", false), new tTVPPreferenceInfoCheckBox("GL_ARB_copy_image", "GL_ARB_copy_image", false), diff --git a/src/core/environ/ui/PreferenceForm.cpp b/src/core/environ/ui/PreferenceForm.cpp index 23850b85..0215f21a 100644 --- a/src/core/environ/ui/PreferenceForm.cpp +++ b/src/core/environ/ui/PreferenceForm.cpp @@ -108,6 +108,31 @@ void tPreferenceItemCheckBox::onPressStateChangedToPressed() { void tPreferenceItemConstant::initController(const NodeMap &allNodes) { highlight = 
allNodes.findController("highlight"); allNodes.findController("dir_icon")->setVisible(false); + Size origSize = _title->getContentSize(); + _title->setTextAreaSize(Size::ZERO); + std::string s = _title->getString(); + Size sizeTmp = _title->getVirtualRendererSize(); + float addHeight = 0; + if (sizeTmp.width < origSize.width) { // single line + sizeTmp.width = origSize.width;; + sizeTmp.height = 0; + _title->setTextAreaSize(sizeTmp); + addHeight = _title->getVirtualRendererSize().height - origSize.height; + if (addHeight < 0) addHeight = 0; + } else { // multi line + sizeTmp.width = origSize.width;; + sizeTmp.height = 0; + _title->setTextAreaSize(sizeTmp); + sizeTmp = _title->getVirtualRendererSize(); + _title->setContentSize(sizeTmp); + addHeight = sizeTmp.height - origSize.height; + } + Node *root = getChildren().front(); + sizeTmp = root->getContentSize(); + sizeTmp.height += addHeight; + root->setContentSize(sizeTmp); + ui::Helper::doLayout(root); + setContentSize(sizeTmp); } const char* tPreferenceItemSubDir::getUIFileName() const { diff --git a/src/core/environ/ui/PreferenceForm.h b/src/core/environ/ui/PreferenceForm.h index cbe94f51..c61cb04b 100644 --- a/src/core/environ/ui/PreferenceForm.h +++ b/src/core/environ/ui/PreferenceForm.h @@ -3,8 +3,6 @@ #include "ui/UIWidget.h" #include "ConfigManager/GlobalConfigManager.h" -void TVPOnOpenGLRendererSelected(); - namespace tinyxml2 { class XMLElement; } diff --git a/src/core/environ/win32/Platform.cpp b/src/core/environ/win32/Platform.cpp index 38642e15..7bb904a2 100644 --- a/src/core/environ/win32/Platform.cpp +++ b/src/core/environ/win32/Platform.cpp @@ -1,5 +1,5 @@ #include "Platform.h" -#include "cocos2d\MainScene.h" +#include "cocos2d/MainScene.h" //#undef WIN32 #include #include @@ -11,6 +11,8 @@ #include #include "Application.h" #include "EventIntf.h" +#include "cocos/base/CCDirector.h" +#include #pragma comment(lib,"psapi.lib") @@ -68,19 +70,9 @@ extern "C" int usleep(unsigned long us) { return 0; } 
-extern "C" __declspec(dllimport) int __cdecl __wgetmainargs(int * _Argc, wchar_t *** _Argv, wchar_t *** _Env, int _DoWildCard, void * _StartInfo); +//extern "C" __declspec(dllimport) int __cdecl __wgetmainargs(int * _Argc, wchar_t *** _Argv, wchar_t *** _Env, int _DoWildCard, void * _StartInfo); std::wstring_convert> converter; std::string TVPGetDefaultFileDir() { - wchar_t **argv, **env; - int argc; - struct - { - int newmode; - } info = { 0 }; - __wgetmainargs(&argc, &argv, &env, 0, &info); -// if (argc > 1) { -// return converter.to_bytes(argv[1]); -// } wchar_t buf[MAX_PATH]; _wgetcwd(buf, sizeof(buf) / sizeof(buf[0])); wchar_t *p = buf; @@ -94,13 +86,14 @@ std::string TVPGetDefaultFileDir() { int TVPCheckArchive(const ttstr &localname); void TVPCheckAndSendDumps(const std::string &dumpdir, const std::string &packageName, const std::string &versionStr); bool TVPCheckStartupArg() { - wchar_t **argv, **env; - int argc; + wchar_t **argv = __wargv, **env; + int argc = __argc; struct { int newmode; } info = { 0 }; - __wgetmainargs(&argc, &argv, &env, 0, &info); + argv = CommandLineToArgvW(GetCommandLineW(), &argc); +// __wgetmainargs(&argc, &argv, &env, 0, &info); TVPCheckAndSendDumps(TVPGetDefaultFileDir() + "/dumps", "win32-test", "test"); if (argc > 1) { std::wstring_convert> converter; diff --git a/src/core/environ/win32/SystemControl.cpp b/src/core/environ/win32/SystemControl.cpp index 3b09e938..b9490519 100644 --- a/src/core/environ/win32/SystemControl.cpp +++ b/src/core/environ/win32/SystemControl.cpp @@ -14,7 +14,7 @@ #include "StorageIntf.h" #include "EmergencyExit.h" // for TVPCPUClock #include "DebugIntf.h" -#include "VersionFormUnit.h" +//#include "VersionFormUnit.h" #include "WaveImpl.h" #include "SystemImpl.h" #include "UserEvent.h" @@ -112,14 +112,12 @@ bool tTVPSystemControl::ApplicationIdle() { return cont; } -void stop_profile(); void tTVPSystemControl::DeliverEvents() { if(ContinuousEventCalling) TVPProcessContinuousHandlerEventFlag = true; // 
set flag if (EventEnable) { TVPDeliverAllEvents(); - stop_profile(); } } diff --git a/src/core/environ/win32/TVPWindow.h b/src/core/environ/win32/TVPWindow.h index 6ce3e140..ff79efd3 100644 --- a/src/core/environ/win32/TVPWindow.h +++ b/src/core/environ/win32/TVPWindow.h @@ -350,7 +350,7 @@ class iWindowLayer { virtual tjs_int GetHeight() const = 0; virtual void GetWinSize(tjs_int &w, tjs_int &h) = 0; virtual void SetZoom(tjs_int numer, tjs_int denom) = 0; - virtual void UpdateDrawBuffer(const iTVPBaseBitmap *buf) = 0; + virtual void UpdateDrawBuffer(iTVPTexture2D *tex) = 0; #if 0 virtual void AddOverlay(tTJSNI_BaseVideoOverlay *ovl) = 0; virtual void RemoveOverlay(tTJSNI_BaseVideoOverlay *ovl) = 0; diff --git a/src/core/movie/ffmpeg/VideoCodecFFmpeg.cpp b/src/core/movie/ffmpeg/VideoCodecFFmpeg.cpp index 2c4660fb..e590002d 100644 --- a/src/core/movie/ffmpeg/VideoCodecFFmpeg.cpp +++ b/src/core/movie/ffmpeg/VideoCodecFFmpeg.cpp @@ -793,7 +793,7 @@ bool CDVDVideoCodecFFmpeg::GetPictureCommon(DVDVideoPicture* pDvdVideoPicture) pDvdVideoPicture->iWidth = m_pFrame->width; pDvdVideoPicture->iHeight = m_pFrame->height; - +#if 0 /* crop of 10 pixels if demuxer asked it */ if(m_pCodecContext->coded_width && m_pCodecContext->coded_width < (int)pDvdVideoPicture->iWidth && m_pCodecContext->coded_width > (int)pDvdVideoPicture->iWidth - 10) @@ -802,7 +802,7 @@ bool CDVDVideoCodecFFmpeg::GetPictureCommon(DVDVideoPicture* pDvdVideoPicture) if(m_pCodecContext->coded_height && m_pCodecContext->coded_height < (int)pDvdVideoPicture->iHeight && m_pCodecContext->coded_height > (int)pDvdVideoPicture->iHeight - 10) pDvdVideoPicture->iHeight = m_pCodecContext->coded_height; - +#endif double aspect_ratio; /* use variable in the frame */ @@ -915,25 +915,36 @@ bool CDVDVideoCodecFFmpeg::GetPictureCommon(DVDVideoPicture* pDvdVideoPicture) bool CDVDVideoCodecFFmpeg::GetPicture(DVDVideoPicture* pDvdVideoPicture) { - if (m_pHardware) - return m_pHardware->GetPicture(m_pCodecContext, m_pFrame, 
pDvdVideoPicture); - - if (!GetPictureCommon(pDvdVideoPicture)) - return false; - - for (int i = 0; i < 4; i++) - pDvdVideoPicture->data[i] = m_pFrame->data[i]; - for (int i = 0; i < 4; i++) - pDvdVideoPicture->iLineSize[i] = m_pFrame->linesize[i]; - - pDvdVideoPicture->iFlags |= pDvdVideoPicture->data[0] ? 0 : DVP_FLAG_DROPPED; - pDvdVideoPicture->extended_format = 0; - - AVPixelFormat pix_fmt; - pix_fmt = (AVPixelFormat)m_pFrame->format; - - pDvdVideoPicture->format = CDVDCodecUtils::EFormatFromPixfmt(pix_fmt); - return true; + if (m_pHardware) + return m_pHardware->GetPicture(m_pCodecContext, m_pFrame, pDvdVideoPicture); + + AVPixelFormat pix_fmt; + pix_fmt = (AVPixelFormat)m_pFrame->format; + pDvdVideoPicture->format = CDVDCodecUtils::EFormatFromPixfmt(pix_fmt); + + while (m_pCodecContext->coded_width > 0 && m_pCodecContext->coded_height > 0) { + if (pDvdVideoPicture->format == RENDER_FMT_YUV420P) { + int pitch = m_pFrame->linesize[0]; + if (pitch < m_pCodecContext->coded_width || pitch > m_pCodecContext->coded_width + 16) + break; + } + m_pFrame->width = m_pCodecContext->coded_width; + m_pFrame->height = m_pCodecContext->coded_height; + break; + } + + if (!GetPictureCommon(pDvdVideoPicture)) + return false; + + for (int i = 0; i < 4; i++) + pDvdVideoPicture->data[i] = m_pFrame->data[i]; + for (int i = 0; i < 4; i++) + pDvdVideoPicture->iLineSize[i] = m_pFrame->linesize[i]; + + pDvdVideoPicture->iFlags |= pDvdVideoPicture->data[0] ? 
0 : DVP_FLAG_DROPPED; + pDvdVideoPicture->extended_format = 0; + + return true; } int CDVDVideoCodecFFmpeg::FilterOpen(const std::string& filters, bool scale) diff --git a/src/core/movie/ffmpeg/VideoPlayer.cpp b/src/core/movie/ffmpeg/VideoPlayer.cpp index 89008ca5..4be9014b 100644 --- a/src/core/movie/ffmpeg/VideoPlayer.cpp +++ b/src/core/movie/ffmpeg/VideoPlayer.cpp @@ -11,6 +11,7 @@ #include "tjsConfig.h" #include "Application.h" #include +#include NS_KRMOVIE_BEGIN @@ -195,6 +196,7 @@ BasePlayer::BasePlayer(CBaseRenderer *renderer) m_streamPlayerSpeed = DVD_PLAYSPEED_NORMAL; m_caching = CACHESTATE_DONE; memset(&m_SpeedState, 0, sizeof(m_SpeedState)); +#if 0 ::Application->RegisterActiveEvent(this, [](void* p, eTVPActiveEvent ev){ switch (ev) { case eTVPActiveEvent::onActive: @@ -205,6 +207,7 @@ BasePlayer::BasePlayer(CBaseRenderer *renderer) break; } }); +#endif } BasePlayer::~BasePlayer() { diff --git a/src/core/movie/ffmpeg/VideoPlayerAudio.cpp b/src/core/movie/ffmpeg/VideoPlayerAudio.cpp index 52bb5717..48851030 100644 --- a/src/core/movie/ffmpeg/VideoPlayerAudio.cpp +++ b/src/core/movie/ffmpeg/VideoPlayerAudio.cpp @@ -447,7 +447,7 @@ void CVideoPlayerAudio::Process() msg.cachetime = cachetime; msg.timestamp = audioframe.hasTimestamp ? audioframe.pts : DVD_NOPTS_VALUE; m_messageParent.Put(new CDVDMsgType(CDVDMsg::PLAYER_STARTED, msg)); - +#if 0 if (consumed < pPacket->iSize) { pPacket->iSize -= consumed; @@ -455,6 +455,7 @@ void CVideoPlayerAudio::Process() m_messageQueue.Put(pMsg->AddRef(), 0, false); break; } +#endif } } } @@ -462,13 +463,18 @@ void CVideoPlayerAudio::Process() // guess next pts m_audioClock += audioframe.duration; - int ret = m_pAudioCodec->Decode(nullptr, 0, DVD_NOPTS_VALUE, DVD_NOPTS_VALUE); + if (consumed >= pPacket->iSize) + break; + int ret = m_pAudioCodec->Decode(pPacket->pData + consumed, pPacket->iSize - consumed, DVD_NOPTS_VALUE, DVD_NOPTS_VALUE); if (ret < 0) { // CLog::Log(LOGERROR, "CVideoPlayerAudio::DecodeFrame - Decode Error. 
Skipping audio packet (%d)", ret); m_pAudioCodec->Reset(); break; } + else { + consumed += ret; + } } // while decoder produces output } // demuxer packet diff --git a/src/core/tjs2/tjsByteCodeLoader.cpp b/src/core/tjs2/tjsByteCodeLoader.cpp index 18599bf4..7f2979ce 100644 --- a/src/core/tjs2/tjsByteCodeLoader.cpp +++ b/src/core/tjs2/tjsByteCodeLoader.cpp @@ -217,7 +217,7 @@ void tTJSByteCodeLoader::ReadObjects( tTJSScriptBlock* block, const tjs_uint8* b count = read4byte( &(buff[offset]) ); const tjs_int codeSize = count; offset += 4; - tjs_int32* code = new tjs_int32[count]; + tjs_int32* code = (tjs_int32*)TJS_malloc(count * sizeof(tjs_int32)); for( int i = 0; i < count; i++ ) { tjs_int16 c = (tjs_int16)read2byte( &(buff[offset]) ); code[i] = c; diff --git a/src/core/tjs2/tjsConfig.cpp b/src/core/tjs2/tjsConfig.cpp index 49e9ac00..9c7063f9 100644 --- a/src/core/tjs2/tjsConfig.cpp +++ b/src/core/tjs2/tjsConfig.cpp @@ -21,6 +21,7 @@ #define isfinite std::isfinite #endif #define INTMAX_MAX 0x7fffffffffffffff +#include /* * core/utils/cp932_uni.cpp diff --git a/src/core/tjs2/tjsUtils.h b/src/core/tjs2/tjsUtils.h index f06d8a61..5b3825c9 100644 --- a/src/core/tjs2/tjsUtils.h +++ b/src/core/tjs2/tjsUtils.h @@ -196,6 +196,36 @@ class tRefHolder T* operator->() const { return get(); } }; +template +class tRefPtr +{ +private: + T *_ptr; +public: + tRefPtr() : _ptr(nullptr) {} + tRefPtr(T* p) : _ptr(nullptr) { if (p) p->AddRef(), _ptr = p; } + tRefPtr(const tRefPtr &ref) { _ptr = ref._ptr; if (_ptr) _ptr->AddRef(); } + ~tRefPtr() { if (_ptr) _ptr->Release(); } + const tRefPtr & operator = (const tRefPtr & rhs) { + if (rhs._ptr != _ptr) { + if (_ptr)_ptr->Release(); + _ptr = rhs._ptr; + if (_ptr) _ptr->AddRef(); + } + return *this; + } + const tRefPtr & operator = (T *rhs) { + if (rhs != _ptr) { + if (_ptr)_ptr->Release(); + _ptr = rhs; + if (_ptr) _ptr->AddRef(); + } + return *this; + } + T* get() const { return _ptr; } + operator T*() const { return get(); } + T* 
operator->() const { return get(); } +}; /*]*/ diff --git a/src/core/utils/win32/ThreadImpl.cpp b/src/core/utils/win32/ThreadImpl.cpp index 0b801a06..f1f58614 100644 --- a/src/core/utils/win32/ThreadImpl.cpp +++ b/src/core/utils/win32/ThreadImpl.cpp @@ -21,7 +21,7 @@ #if defined(CC_TARGET_OS_IPHONE) || defined(__aarch64__) #else -#define USING_THREADPOOL11 +//#define USING_THREADPOOL11 #endif #ifdef USING_THREADPOOL11 @@ -135,6 +135,7 @@ void tTVPThread::Resume() //--------------------------------------------------------------------------- void tTVPThreadEvent::Set() { + std::unique_lock lk(Mutex); Handle.notify_one(); } //--------------------------------------------------------------------------- @@ -193,7 +194,7 @@ tjs_int TVPGetThreadNum(void) //--------------------------------------------------------------------------- void TVPExecThreadTask(int numThreads, TVP_THREAD_TASK_FUNC func) { - if (TVPThreadTaskCount >= TVPThreadTaskNum - 1) { + if (numThreads == 1) { func(0); return; } diff --git a/src/core/visual/ARM/AddBlend.h b/src/core/visual/ARM/AddBlend.h deleted file mode 100644 index f8f560fa..00000000 --- a/src/core/visual/ARM/AddBlend.h +++ /dev/null @@ -1,83 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len -#ifdef BLEND_WITH_OPACITY - , tjs_int opa -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, src, PreFragLen -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - dest += PreFragLen; - src += PreFragLen; - } - } - -#ifdef BLEND_WITH_OPACITY - uint8x8_t opa8 = vdup_n_u8(opa); -#endif -#ifdef BLEND_WITH_OPACITY - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; -#else - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; -#endif - while(dest < pVecEndDst) { -#ifdef BLEND_WITH_OPACITY - uint8x8x4_t s_argb8 = 
vld4_u8((unsigned char*)src); -#ifdef SUB_FUNC - s_argb8.val[2] = vmvn_u8(s_argb8.val[2]); - s_argb8.val[1] = vmvn_u8(s_argb8.val[1]); - s_argb8.val[0] = vmvn_u8(s_argb8.val[0]); -#endif - uint16x8_t s_r16 = vmull_u8(s_argb8.val[2], opa8); - uint16x8_t s_g16 = vmull_u8(s_argb8.val[1], opa8); - uint16x8_t s_b16 = vmull_u8(s_argb8.val[0], opa8); - s_argb8.val[2] = vshrn_n_u16(s_r16, 8); - s_argb8.val[1] = vshrn_n_u16(s_g16, 8); - s_argb8.val[0] = vshrn_n_u16(s_b16, 8); - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d_argb8.val[2] = OP_FUNC(d_argb8.val[2], s_argb8.val[2]); - d_argb8.val[1] = OP_FUNC(d_argb8.val[1], s_argb8.val[1]); - d_argb8.val[0] = OP_FUNC(d_argb8.val[0], s_argb8.val[0]); - - vst4_u8((unsigned char *)dest, d_argb8); - dest += 8; - src += 8; -#else // normal add-blend - uint8x16x4_t s_argb8 = vld4q_u8((unsigned char*)src); -#ifdef SUB_FUNC -#ifndef HOLD_DEST_ALPHA - s_argb8.val[3] = vmvnq_u8(s_argb8.val[3]); -#endif - s_argb8.val[2] = vmvnq_u8(s_argb8.val[2]); - s_argb8.val[1] = vmvnq_u8(s_argb8.val[1]); - s_argb8.val[0] = vmvnq_u8(s_argb8.val[0]); -#endif - uint8x16x4_t d_argb8 = vld4q_u8((unsigned char*)dest); -#ifndef HOLD_DEST_ALPHA - d_argb8.val[3] = OP_FUNC(d_argb8.val[3], s_argb8.val[3]); -#endif - d_argb8.val[2] = OP_FUNC(d_argb8.val[2], s_argb8.val[2]); - d_argb8.val[1] = OP_FUNC(d_argb8.val[1], s_argb8.val[1]); - d_argb8.val[0] = OP_FUNC(d_argb8.val[0], s_argb8.val[0]); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - - dest += 16; - src += 16; -#endif - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, src, pEndDst - dest -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/ApplyColorMap.h b/src/core/visual/ARM/ApplyColorMap.h deleted file mode 100644 index 0425a574..00000000 --- a/src/core/visual/ARM/ApplyColorMap.h +++ /dev/null @@ -1,107 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, const tjs_uint8 *src, 
tjs_int len, tjs_uint32 color -#ifdef BLEND_WITH_OPACITY - , tjs_int opa -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, src, PreFragLen, color -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - dest += PreFragLen; - src += PreFragLen; - } - } - -#ifdef BLEND_WITH_DEST_ALPHA - unsigned char tmpbuff[32 + 8]; - unsigned short *tmpa = __builtin_assume_aligned((unsigned short*)((((intptr_t)tmpbuff) + 15) & ~15), 16); - unsigned char *tmpd = __builtin_assume_aligned((unsigned char*)(tmpa + 8), 16); -#ifdef BLEND_WITH_OPACITY - uint16x8_t opamask = vdupq_n_u16(0xFF00); -#endif -#endif - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - uint8x8_t s_r8 = vdup_n_u8((color >> 16) & 0xFF); - uint8x8_t s_g8 = vdup_n_u8((color >> 8) & 0xFF); - uint8x8_t s_b8 = vdup_n_u8((color >> 0) & 0xFF); - while(dest < pVecEndDst) { - uint8x8_t s_a8 = vld1_u8(src); -#ifdef BLEND_WITH_ADDALPHA -#ifdef BLEND_WITH_OPACITY - uint16x8_t s_a16 = vmulq_n_u16(vmovl_u8(s_a8), opa); -#endif - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -#ifdef BLEND_WITH_OPACITY - s_a8 = vshrn_n_u16(s_a16, 8); -#endif - //uint16x8_t s_a16 = vsub_u8(s_a8, vshr_n_u8(s_a8, 7)); /* adjust alpha */ - uint16x8_t tmp = vmull_u8(d_argb8.val[3], s_a8); - uint8x8_t s_ia8 = vmvn_u8(s_a8); - uint16x8_t s_r16 = vmull_u8(s_r8, s_a8); - uint16x8_t s_g16 = vmull_u8(s_g8, s_a8); - uint16x8_t s_b16 = vmull_u8(s_b8, s_a8); - uint16x8_t d_r16 = vmull_u8(d_argb8.val[2], s_ia8); - uint16x8_t d_g16 = vmull_u8(d_argb8.val[1], s_ia8); - uint16x8_t d_b16 = vmull_u8(d_argb8.val[0], s_ia8); - tmp = vsubq_u16(vaddl_u8(d_argb8.val[3], s_a8), vshrq_n_u16(tmp, 8)); - d_argb8.val[3] = vsub_u8(vmovn_u16(tmp), vshrn_n_u16(tmp, 8)); - d_argb8.val[2] = vqadd_u8(vshrn_n_u16(d_r16, 8), vshrn_n_u16(s_r16, 8)); - d_argb8.val[1] = 
vqadd_u8(vshrn_n_u16(d_g16, 8), vshrn_n_u16(s_g16, 8)); - d_argb8.val[0] = vqadd_u8(vshrn_n_u16(d_b16, 8), vshrn_n_u16(s_b16, 8)); -#else -#ifdef BLEND_WITH_OPACITY - uint16x8_t s_a16 = vmulq_n_u16(vmovl_u8(s_a8), opa); -#elif defined(BLEND_WITH_DEST_ALPHA) - uint16x8_t s_a16 = vshll_n_u8(s_a8, 8); -#else - uint16x8_t s_a16 = vmovl_u8(s_a8); -#endif - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -#if defined(BLEND_WITH_DEST_ALPHA) -#ifdef BLEND_WITH_OPACITY - uint16x8_t isd_a16 = vmull_u8(vmvn_u8(vshrn_n_u16(s_a16, 8)), vmvn_u8(d_argb8.val[3])); - s_a16 = vandq_u16(s_a16, opamask); -#else - uint16x8_t isd_a16 = vmull_u8(vmvn_u8(s_a8), vmvn_u8(d_argb8.val[3])); -#endif - vst1q_u16(tmpa, vorrq_u16(s_a16, vmovl_u8(d_argb8.val[3]))); - d_argb8.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); - tmpa[0] = TVPOpacityOnOpacityTable[tmpa[0]]; - tmpa[1] = TVPOpacityOnOpacityTable[tmpa[1]]; - tmpa[2] = TVPOpacityOnOpacityTable[tmpa[2]]; - tmpa[3] = TVPOpacityOnOpacityTable[tmpa[3]]; - tmpa[4] = TVPOpacityOnOpacityTable[tmpa[4]]; - tmpa[5] = TVPOpacityOnOpacityTable[tmpa[5]]; - tmpa[6] = TVPOpacityOnOpacityTable[tmpa[6]]; - tmpa[7] = TVPOpacityOnOpacityTable[tmpa[7]]; - s_a16 = vld1q_u16(tmpa); -#elif defined(BLEND_WITH_OPACITY) - s_a16 = vshrq_n_u16(s_a16, 8); -#endif - uint16x8_t d_r16 = vmulq_u16(vsubl_u8(s_r8, d_argb8.val[2]), s_a16); - uint16x8_t d_g16 = vmulq_u16(vsubl_u8(s_g8, d_argb8.val[1]), s_a16); - uint16x8_t d_b16 = vmulq_u16(vsubl_u8(s_b8, d_argb8.val[0]), s_a16); - d_argb8.val[2] = vadd_u8(d_argb8.val[2], vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vadd_u8(d_argb8.val[1], vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vadd_u8(d_argb8.val[0], vshrn_n_u16(d_b16, 8)); -#endif - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, src, pEndDst - dest, color -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - } -} \ No newline at end of file diff --git 
a/src/core/visual/ARM/ConstAlphaBlend.h b/src/core/visual/ARM/ConstAlphaBlend.h deleted file mode 100644 index 220327b6..00000000 --- a/src/core/visual/ARM/ConstAlphaBlend.h +++ /dev/null @@ -1,155 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, -#ifdef STRECH_FUNC - tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep -#elif defined(LINEAR_TRANS_FUNC) - tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch -#else - const tjs_uint32 *src, tjs_int len -#endif - , tjs_int copa) -{ - tjs_uint32* pEndDst = dest + len; - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, -#ifdef STRECH_FUNC - PreFragLen, src, srcstart, srcstep -#elif defined(LINEAR_TRANS_FUNC) - PreFragLen, src, sx, sy, stepx, stepy, srcpitch -#else - src, PreFragLen -#endif - ,copa); - dest += PreFragLen; -#ifdef STRECH_FUNC - srcstart += srcstep * PreFragLen; -#elif defined(LINEAR_TRANS_FUNC) - sx += stepx * PreFragLen; - sy += stepy * PreFragLen; -#else - src += PreFragLen; -#endif - } - tjs_int opa = copa; -#if defined(STRECH_FUNC) || defined(LINEAR_TRANS_FUNC) - unsigned char strechbuff[32 + 16]; - tjs_uint32 *strechsrc = __builtin_assume_aligned((tjs_uint32*)((((intptr_t)strechbuff) + 15) & ~15), 16); - if(opa > 128) opa ++; /* adjust for error */ -#endif - - -#ifdef BLEND_WITH_DEST_ALPHA - unsigned char tmpbuff[32 + 8]; - unsigned short *tmpa = __builtin_assume_aligned((unsigned short*)((((intptr_t)tmpbuff) + 15) & ~15), 16); - unsigned char *tmpd = __builtin_assume_aligned((unsigned char*)(tmpa + 8), 16); - //uint16x8_t opa16 = vdupq_n_u16(opa); - uint16x8_t hopa16 = vdupq_n_u16(copa << 8); - uint8x8_t iopa8 = vdup_n_u8(~opa); -#endif - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - while(dest < pVecEndDst) { -#if defined(STRECH_FUNC) || defined(LINEAR_TRANS_FUNC) - for(int i = 0; i < 8; 
++i) { -#ifdef STRECH_FUNC - strechsrc[i] = src[(srcstart) >> 16]; - srcstart += srcstep; -#elif defined(LINEAR_TRANS_FUNC) - strechsrc[i] = *( (const tjs_uint32*)((const tjs_uint8*)src + (sy>>16)*srcpitch) + (sx>>16)); - sx += stepx; - sy += stepy; -#endif - } - uint8x8x4_t s = vld4_u8((unsigned char *)strechsrc); -#else - uint8x8x4_t s = vld4_u8((unsigned char *)src); -#endif - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -#ifdef BLEND_WITH_ADDALPHA - s.val[3] = vdup_n_u8(opa); - uint16x8_t d_a16 = vmull_u8(s.val[3], d.val[3]); - //Da = Sa + Da - SaDa - d_a16 = vsubq_u16(vaddl_u8(s.val[3], d.val[3]), vshrq_n_u16(d_a16, 8)); - d.val[3] = vmovn_u16(vsubq_u16(d_a16, vshrq_n_u16(d_a16, 8))); - - // Di = sat(Si, (1-Sa)*Di) - s.val[3] = vmvn_u8(s.val[3]); - uint16x8_t d_r16 = vmull_u8(d.val[2], s.val[3]); - uint16x8_t d_g16 = vmull_u8(d.val[1], s.val[3]); - uint16x8_t d_b16 = vmull_u8(d.val[0], s.val[3]); - - // 8-bit to do saturated add - d.val[2] = vqadd_u8(s.val[2], vshrn_n_u16(d_r16, 8)); - d.val[1] = vqadd_u8(s.val[1], vshrn_n_u16(d_g16, 8)); - d.val[0] = vqadd_u8(s.val[0], vshrn_n_u16(d_b16, 8)); -#else -#ifdef BLEND_WITH_DEST_ALPHA - uint16x8_t isd_a16 = vmull_u8(iopa8, vmvn_u8(d.val[3])); -#if 1 - uint16x8_t s_a16 = vorrq_u16(hopa16, vmovl_u8(d.val[3])); - d.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); - vst1q_u16(tmpa, s_a16); - tmpd[0] = TVPOpacityOnOpacityTable[tmpa[0]]; - tmpd[1] = TVPOpacityOnOpacityTable[tmpa[1]]; - tmpd[2] = TVPOpacityOnOpacityTable[tmpa[2]]; - tmpd[3] = TVPOpacityOnOpacityTable[tmpa[3]]; - tmpd[4] = TVPOpacityOnOpacityTable[tmpa[4]]; - tmpd[5] = TVPOpacityOnOpacityTable[tmpa[5]]; - tmpd[6] = TVPOpacityOnOpacityTable[tmpa[6]]; - tmpd[7] = TVPOpacityOnOpacityTable[tmpa[7]]; - s_a16 = vmovl_u8(vld1_u8(tmpd)); -#else - uint16x8_t d_a16 = vmovl_u8(d.val[3]); - uint16x8_t sd_a16 = vmulq_u16(opa16, d_a16); - uint16x8_t sopa = vshlq_n_u16(vaddq_u16(opa16, d_a16), 8); - d.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); - 
vst1q_u16(tmpa, vshrq_n_u16(vsubq_u16(sopa, sd_a16), 8)); - tmpa[0] = TVPRecipTableForOpacityOnOpacity[tmpa[0]]; - tmpa[1] = TVPRecipTableForOpacityOnOpacity[tmpa[1]]; - tmpa[2] = TVPRecipTableForOpacityOnOpacity[tmpa[2]]; - tmpa[3] = TVPRecipTableForOpacityOnOpacity[tmpa[3]]; - tmpa[4] = TVPRecipTableForOpacityOnOpacity[tmpa[4]]; - tmpa[5] = TVPRecipTableForOpacityOnOpacity[tmpa[5]]; - tmpa[6] = TVPRecipTableForOpacityOnOpacity[tmpa[6]]; - tmpa[7] = TVPRecipTableForOpacityOnOpacity[tmpa[7]]; - uint16x8_t s_a16 = vmulq_u16(vld1q_u16(tmpa), opa16); - s_a16 = vshrq_n_u16(s_a16, 8); -#endif - - // d = d + (s - d) * opa - uint16x8_t d_r16 = vsubl_u8(s.val[2], d.val[2]); - uint16x8_t d_g16 = vsubl_u8(s.val[1], d.val[1]); - uint16x8_t d_b16 = vsubl_u8(s.val[0], d.val[0]); - d_r16 = vmulq_u16(d_r16, s_a16); - d_g16 = vmulq_u16(d_g16, s_a16); - d_b16 = vmulq_u16(d_b16, s_a16); -#else - // d = d + (s - d) * opa - uint16x8_t d_r16 = vmulq_n_u16(vsubl_u8(s.val[2], d.val[2]), opa); - uint16x8_t d_g16 = vmulq_n_u16(vsubl_u8(s.val[1], d.val[1]), opa); - uint16x8_t d_b16 = vmulq_n_u16(vsubl_u8(s.val[0], d.val[0]), opa); -#endif // BLEND_WITH_DEST_ALPHA - d.val[2] = vadd_u8(d.val[2], vshrn_n_u16(d_r16, 8)); - d.val[1] = vadd_u8(d.val[1], vshrn_n_u16(d_g16, 8)); - d.val[0] = vadd_u8(d.val[0], vshrn_n_u16(d_b16, 8)); -#endif // BLEND_WITH_ADDALPHA - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); -#if defined(STRECH_FUNC) || defined(LINEAR_TRANS_FUNC) -#else - src += 8; -#endif - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, -#ifdef STRECH_FUNC - pEndDst - dest, src, srcstart, srcstep -#elif defined(LINEAR_TRANS_FUNC) - pEndDst - dest, src, sx, sy, stepx, stepy, srcpitch -#else - src, pEndDst - dest -#endif - ,copa); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/ConstAlphaBlend2.h b/src/core/visual/ARM/ConstAlphaBlend2.h deleted file mode 100644 index a245cc85..00000000 --- a/src/core/visual/ARM/ConstAlphaBlend2.h +++ /dev/null @@ -1,78 
+0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int len - , tjs_int opa) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, src1, src2, - PreFragLen - , opa); - dest += PreFragLen; - src1 += PreFragLen; - src2 += PreFragLen; - } - } - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; -#ifdef BLEND_WITH_DEST_ALPHA - unsigned char tmpbuff[32 + 8]; - unsigned short *tmpa = (unsigned short*)((((intptr_t)tmpbuff) + 15) & ~15); - tjs_int o = opa; - if(o > 127) o ++; /* adjust for error */ - uint8x8_t opa8 = vdup_n_u8(o); - uint8x8_t iopa8 = vdup_n_u8(256 - o); -#endif - while(dest < pVecEndDst) { - uint8x8x4_t s1 = vld4_u8((unsigned char *)src1); -#ifdef BLEND_WITH_DEST_ALPHA - uint16x8_t s1_a16 = vmull_u8(s1.val[3], iopa8); -#endif - uint8x8x4_t s2 = vld4_u8((unsigned char *)src2); -#if defined(BLEND_WITH_DEST_ALPHA) - uint16x8_t o16 = vmull_u8(s2.val[3], opa8); - - o16 = vsriq_n_u16(o16, s1_a16, 8); // addr - vst1q_u16(tmpa, o16); - for(int i = 0; i < 8; ++i) { - unsigned int addr = tmpa[i]; - tmpa[i] = TVPOpacityOnOpacityTable[addr]; - } - o16 = vld1q_u16(tmpa); - - //uint16x8_t s_a16 = vmulq_u16(vsubl_u8(a.val[3], b.val[3]), s_a16); - uint16x8_t d_r16 = vmulq_u16(vsubl_u8(s2.val[2], s1.val[2]), o16); - uint16x8_t d_g16 = vmulq_u16(vsubl_u8(s2.val[1], s1.val[1]), o16); - uint16x8_t d_b16 = vmulq_u16(vsubl_u8(s2.val[0], s1.val[0]), o16); -#else // BLEND_WITH_DEST_ALPHA - // s1 * o + s2 * (1 - o) => s1 + (s2 - s1) * o -#ifdef BLEND_WITH_ADDALPHA - uint16x8_t d_a16 = vmulq_n_u16(vsubl_u8(s2.val[3], s1.val[3]), opa); -#endif - uint16x8_t d_r16 = vmulq_n_u16(vsubl_u8(s2.val[2], s1.val[2]), opa); - uint16x8_t d_g16 = vmulq_n_u16(vsubl_u8(s2.val[1], s1.val[1]), opa); - uint16x8_t d_b16 = vmulq_n_u16(vsubl_u8(s2.val[0], s1.val[0]), opa); -#endif 
//BLEND_WITH_DEST_ALPHA - - // s * o >> 8 -#ifdef BLEND_WITH_ADDALPHA - s2.val[3] = vadd_u8(s1.val[3], vshrn_n_u16(d_a16, 8)); -#else - //s2.val[3] = vdup_n_u8(0); -#endif - s2.val[2] = vadd_u8(s1.val[2], vshrn_n_u16(d_r16, 8)); - s2.val[1] = vadd_u8(s1.val[1], vshrn_n_u16(d_g16, 8)); - s2.val[0] = vadd_u8(s1.val[0], vshrn_n_u16(d_b16, 8)); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s2); - - src1 += 8; - src2 += 8; - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, src1, src2, - pEndDst - dest - , opa); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/ConstColorAlphaBlend.h b/src/core/visual/ARM/ConstColorAlphaBlend.h deleted file mode 100644 index b116d0ff..00000000 --- a/src/core/visual/ARM/ConstColorAlphaBlend.h +++ /dev/null @@ -1,103 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, tjs_int len, tjs_uint32 color, tjs_int opa) -{ - /* this function always holds desitination alpha channel */ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, PreFragLen, color, opa); - dest += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; -#ifdef BLEND_WITH_DEST_ALPHA - unsigned char tmpbuff[32 + 8]; - unsigned short *tmpa = __builtin_assume_aligned((unsigned short*)((((intptr_t)tmpbuff) + 15) & ~15), 16); - unsigned char *tmpd = __builtin_assume_aligned((unsigned char *)(tmpa + 8), 16); - uint16x8_t hopa16 = vdupq_n_u16(opa << 8); - uint8x8_t s_r8 = vdup_n_u8((color >> 16) & 0xFF); - uint8x8_t s_g8 = vdup_n_u8((color >> 8) & 0xFF); - uint8x8_t s_b8 = vdup_n_u8((color >> 0) & 0xFF); -#elif defined(BLEND_WITH_ADDALPHA) - uint8x8_t s_r8 = vdup_n_u8((((color >> 16) & 0xFF) * opa) >> 8); - uint8x8_t s_g8 = vdup_n_u8((((color >> 8) & 0xFF) * opa) >> 8); - uint8x8_t s_b8 = vdup_n_u8((((color >> 0) & 0xFF) * opa) >> 8); - uint8x8_t s_a8 = 
vdup_n_u8(opa); -#else - uint16x8_t s_r16 = vdupq_n_u16(((color >> 16) & 0xFF) * opa); - uint16x8_t s_g16 = vdupq_n_u16(((color >> 8) & 0xFF) * opa); - uint16x8_t s_b16 = vdupq_n_u16(((color >> 0) & 0xFF) * opa); -#endif - uint8x8_t s_ia8 = vdup_n_u8(opa ^ 0xFF); - while(dest < pVecEndDst) { - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - -#ifdef BLEND_WITH_ADDALPHA - uint16x8_t tmp = vmull_u8(d_argb8.val[3], s_a8); - uint16x8_t d_r16 = vmull_u8(d_argb8.val[2], s_ia8); - uint16x8_t d_g16 = vmull_u8(d_argb8.val[1], s_ia8); - uint16x8_t d_b16 = vmull_u8(d_argb8.val[0], s_ia8); - tmp = vsubq_u16(vaddl_u8(d_argb8.val[3], s_a8), vshrq_n_u16(tmp, 8)); - d_argb8.val[3] = vsub_u8(vmovn_u16(tmp), vshrn_n_u16(tmp, 8)); - d_argb8.val[2] = vqadd_u8(vshrn_n_u16(d_r16, 8), s_r8); - d_argb8.val[1] = vqadd_u8(vshrn_n_u16(d_g16, 8), s_g8); - d_argb8.val[0] = vqadd_u8(vshrn_n_u16(d_b16, 8), s_b8); -#elif defined(BLEND_WITH_DEST_ALPHA) - uint16x8_t isd_a16 = vmull_u8(s_ia8, vmvn_u8(d_argb8.val[3])); -#if 1 - uint16x8_t s_a16 = vorrq_u16(hopa16, vmovl_u8(d_argb8.val[3])); - d_argb8.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); //(255-((255-dopa)*(255-opa)>>8)) - vst1q_u16(tmpa, s_a16); - tmpd[0] = TVPOpacityOnOpacityTable[tmpa[0]]; - tmpd[1] = TVPOpacityOnOpacityTable[tmpa[1]]; - tmpd[2] = TVPOpacityOnOpacityTable[tmpa[2]]; - tmpd[3] = TVPOpacityOnOpacityTable[tmpa[3]]; - tmpd[4] = TVPOpacityOnOpacityTable[tmpa[4]]; - tmpd[5] = TVPOpacityOnOpacityTable[tmpa[5]]; - tmpd[6] = TVPOpacityOnOpacityTable[tmpa[6]]; - tmpd[7] = TVPOpacityOnOpacityTable[tmpa[7]]; - s_a16 = vmovl_u8(vld1_u8(tmpd)); -#else - uint16x8_t d_a16 = vmovl_u8(d_argb8.val[3]); - uint16x8_t sd_a16 = vmulq_u16(opa16, d_a16); - uint16x8_t sopa = vshlq_n_u16(vaddq_u16(opa16, d_a16), 8); - d_argb8.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); //(255-((255-dopa)*(255-opa)>>8)) - vst1q_u16(tmpa, vshrq_n_u16(vsubq_u16(sopa, sd_a16), 8)); - tmpa[0] = TVPRecipTableForOpacityOnOpacity[tmpa[0]]; - tmpa[1] 
= TVPRecipTableForOpacityOnOpacity[tmpa[1]]; - tmpa[2] = TVPRecipTableForOpacityOnOpacity[tmpa[2]]; - tmpa[3] = TVPRecipTableForOpacityOnOpacity[tmpa[3]]; - tmpa[4] = TVPRecipTableForOpacityOnOpacity[tmpa[4]]; - tmpa[5] = TVPRecipTableForOpacityOnOpacity[tmpa[5]]; - tmpa[6] = TVPRecipTableForOpacityOnOpacity[tmpa[6]]; - tmpa[7] = TVPRecipTableForOpacityOnOpacity[tmpa[7]]; - uint16x8_t s_a16 = vmulq_u16(vld1q_u16(tmpa), opa16); - s_a16 = vshrq_n_u16(s_a16, 8); -#endif - uint16x8_t d_r16 = vsubl_u8(s_r8, d_argb8.val[2]); - uint16x8_t d_g16 = vsubl_u8(s_g8, d_argb8.val[1]); - uint16x8_t d_b16 = vsubl_u8(s_b8, d_argb8.val[0]); - d_r16 = vmulq_u16(d_r16, s_a16); - d_g16 = vmulq_u16(d_g16, s_a16); - d_b16 = vmulq_u16(d_b16, s_a16); - d_argb8.val[2] = vadd_u8(d_argb8.val[2], vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vadd_u8(d_argb8.val[1], vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vadd_u8(d_argb8.val[0], vshrn_n_u16(d_b16, 8)); -#else - uint16x8_t d_r16 = vmull_u8(d_argb8.val[2], s_ia8); - uint16x8_t d_g16 = vmull_u8(d_argb8.val[1], s_ia8); - uint16x8_t d_b16 = vmull_u8(d_argb8.val[0], s_ia8); - d_argb8.val[2] = vshrn_n_u16(vaddq_u16(d_r16, s_r16), 8); - d_argb8.val[1] = vshrn_n_u16(vaddq_u16(d_g16, s_g16), 8); - d_argb8.val[0] = vshrn_n_u16(vaddq_u16(d_b16, s_b16), 8); -#endif - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, pEndDst - dest, color, opa); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/InterpTransBlend.h b/src/core/visual/ARM/InterpTransBlend.h deleted file mode 100644 index 17e60c0b..00000000 --- a/src/core/visual/ARM/InterpTransBlend.h +++ /dev/null @@ -1,220 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)( -tjs_uint32 *dest, tjs_int len -#ifdef LINEAR_TRANS_FUNC - , const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch -#else //STRECH_FUNC - , const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int _blend_y, 
tjs_int srcstart, tjs_int srcstep -#endif // LINEAR_TRANS_FUNC -#ifdef BLEND_WITH_OPACITY - , tjs_int _opa -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, PreFragLen -#ifdef LINEAR_TRANS_FUNC - , src, sx, sy, stepx, stepy, srcpitch -#else //STRECH_FUNC - , src1, src2, _blend_y, srcstart, srcstep -#endif // LINEAR_TRANS_FUNC -#ifdef BLEND_WITH_OPACITY - , _opa -#endif - ); - dest += PreFragLen; -#ifdef LINEAR_TRANS_FUNC - sx += stepx * PreFragLen; - sy += stepy * PreFragLen; -#else //STRECH_FUNC - srcstart += PreFragLen * srcstep; -#endif // LINEAR_TRANS_FUNC - } - } - -#ifdef LINEAR_TRANS_FUNC - unsigned char tmpbuff[4 * 8 * 4 + 2 * 8 * 2 + 16]; - tjs_uint32 *tmp0_0 = (tjs_uint32*)((((intptr_t)tmpbuff) + 15) & ~15); - tjs_uint32 *tmp0_1 = tmp0_0 + 8; - tjs_uint32 *tmp1_0 = tmp0_1 + 8; - tjs_uint32 *tmp1_1 = tmp1_0 + 8; - uint16_t *blend_x = (uint16_t *)(tmp1_1 + 8); - uint16_t *blend_y = (blend_x + 8); -#else //STRECH_FUNC - tjs_int blend_y = _blend_y + (_blend_y >> 7); /* adjust blend ratio */ - - unsigned char tmpbuff[4 * 8 * 3 + 16]; - tjs_uint32 *tmp1_0 = (tjs_uint32*)((((intptr_t)tmpbuff) + 15) & ~15); - tjs_uint32 *tmp1_1 = tmp1_0 + 8; - uint16_t *blend_x = (uint16_t *)(tmp1_1 + 8); -#endif // LINEAR_TRANS_FUNC - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - -#ifdef BLEND_WITH_ADDALPHA -#ifdef BLEND_WITH_OPACITY - uint8x8_t opa8 = vdup_n_u8(_opa); -#endif -#elif defined(BLEND_WITH_OPACITY) - tjs_int opa = _opa + (_opa >> 7); /* adjust opa */ -#endif - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8; -#ifdef STRECH_FUNC - tjs_int start = srcstart; -#endif - { - for(int i = 0; i < 8; ++i) { -#ifdef LINEAR_TRANS_FUNC - const tjs_uint32 *p0, *p1; - int bld_x, bld_y; - bld_x = (sx & 0xffff) >> 8; - bld_x += bld_x >> 7; - bld_y = (sy & 0xffff) >> 8; - bld_y += bld_y >> 7; - blend_x[i] 
= bld_x; - blend_y[i] = bld_y; - - p0 = (const tjs_uint32*)((const tjs_uint8*)src + ((sy>>16) )*srcpitch) + (sx>>16); - p1 = (const tjs_uint32*)((const tjs_uint8*)p0 + srcpitch); - - tmp0_0[i] = p0[0]; - tmp0_1[i] = p0[1]; - tmp1_0[i] = p1[0]; - tmp1_1[i] = p1[1]; - - sx += stepx; - sy += stepy; -#else //STRECH_FUNC - int addr = start >> 16; - tmp1_0[i] = src2[addr]; - tmp1_1[i] = src2[addr + 1]; - blend_x[i] = (start & 0xffff) >> 8; - start += srcstep; -#endif // LINEAR_TRANS_FUNC - } - // TVPBlendARGB(src2[sp], src2[sp+1], blend_x) - uint8x8x4_t b = vld4_u8((unsigned char *)tmp1_0); - uint8x8x4_t a = vld4_u8((unsigned char *)tmp1_1); - uint16x8_t ratio = vld1q_u16(blend_x); // qreg = 5 - // TVPBlendARGB: a * ratio + b * (1 - ratio) => b + (a - b) * ratio - uint16x8_t s_a16 = vmulq_u16(vsubl_u8(a.val[3], b.val[3]), ratio); - uint16x8_t s_r16 = vmulq_u16(vsubl_u8(a.val[2], b.val[2]), ratio); - uint16x8_t s_g16 = vmulq_u16(vsubl_u8(a.val[1], b.val[1]), ratio); - uint16x8_t s_b16 = vmulq_u16(vsubl_u8(a.val[0], b.val[0]), ratio); // qreg = 9 - - s_argb8.val[3] = vadd_u8(b.val[3], vshrn_n_u16(s_a16, 8)); - s_argb8.val[2] = vadd_u8(b.val[2], vshrn_n_u16(s_r16, 8)); - s_argb8.val[1] = vadd_u8(b.val[1], vshrn_n_u16(s_g16, 8)); - s_argb8.val[0] = vadd_u8(b.val[0], vshrn_n_u16(s_b16, 8)); // qreg = 11 - -#ifdef LINEAR_TRANS_FUNC - b = vld4_u8((unsigned char *)tmp0_0); - a = vld4_u8((unsigned char *)tmp0_1); -#else //STRECH_FUNC - start = srcstart; - for(int i = 0; i < 8; ++i) { - int addr = (start) >> 16; - tmp1_0[i] = src1[addr]; - tmp1_1[i] = src1[addr + 1]; - start += srcstep; - } - // TVPBlendARGB(src1[sp], src1[sp+1], blend_x) - b = vld4_u8((unsigned char *)tmp1_0); - a = vld4_u8((unsigned char *)tmp1_1); -#endif // LINEAR_TRANS_FUNC - s_a16 = vmulq_u16(vsubl_u8(a.val[3], b.val[3]), ratio); - s_r16 = vmulq_u16(vsubl_u8(a.val[2], b.val[2]), ratio); - s_g16 = vmulq_u16(vsubl_u8(a.val[1], b.val[1]), ratio); - s_b16 = vmulq_u16(vsubl_u8(a.val[0], b.val[0]), ratio); - 
uint8x8x4_t s2; - s2.val[3] = vadd_u8(b.val[3], vshrn_n_u16(s_a16, 8)); - s2.val[2] = vadd_u8(b.val[2], vshrn_n_u16(s_r16, 8)); - s2.val[1] = vadd_u8(b.val[1], vshrn_n_u16(s_g16, 8)); - s2.val[0] = vadd_u8(b.val[0], vshrn_n_u16(s_b16, 8)); // qreg = 13 - - // TVPBlendARGB -#ifdef LINEAR_TRANS_FUNC - ratio = vld1q_u16(blend_y); - s_a16 = vmulq_u16(vsubl_u8(s_argb8.val[3], s2.val[3]), ratio); - s_r16 = vmulq_u16(vsubl_u8(s_argb8.val[2], s2.val[2]), ratio); - s_g16 = vmulq_u16(vsubl_u8(s_argb8.val[1], s2.val[1]), ratio); - s_b16 = vmulq_u16(vsubl_u8(s_argb8.val[0], s2.val[0]), ratio); -#else //STRECH_FUNC - s_a16 = vmulq_n_u16(vsubl_u8(s_argb8.val[3], s2.val[3]), blend_y); - s_r16 = vmulq_n_u16(vsubl_u8(s_argb8.val[2], s2.val[2]), blend_y); - s_g16 = vmulq_n_u16(vsubl_u8(s_argb8.val[1], s2.val[1]), blend_y); - s_b16 = vmulq_n_u16(vsubl_u8(s_argb8.val[0], s2.val[0]), blend_y); -#endif // LINEAR_TRANS_FUNC - s_argb8.val[3] = vadd_u8(s2.val[3], vshrn_n_u16(s_a16, 8)); - s_argb8.val[2] = vadd_u8(s2.val[2], vshrn_n_u16(s_r16, 8)); - s_argb8.val[1] = vadd_u8(s2.val[1], vshrn_n_u16(s_g16, 8)); - s_argb8.val[0] = vadd_u8(s2.val[0], vshrn_n_u16(s_b16, 8)); -#ifdef BLEND_WITH_OPACITY -#ifdef BLEND_WITH_ADDALPHA - { - s_a16 = vmull_u8(s_argb8.val[3], opa8); - s_r16 = vmull_u8(s_argb8.val[2], opa8); - s_g16 = vmull_u8(s_argb8.val[1], opa8); - s_b16 = vmull_u8(s_argb8.val[0], opa8); - s_argb8.val[3] = vshrn_n_u16(s_a16, 8); - s_argb8.val[2] = vshrn_n_u16(s_r16, 8); - s_argb8.val[1] = vshrn_n_u16(s_g16, 8); - s_argb8.val[0] = vshrn_n_u16(s_b16, 8); - } -#endif -#endif - } -#ifdef COPY_FUNC - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s_argb8); -#else - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -#ifdef BLEND_WITH_ADDALPHA - // TVPAddAlphaBlend_n_a - s_argb8.val[3] = vmvn_u8(s_argb8.val[3]); // 1 - a - - // s + d * (1 - sa) - uint16x8_t d_r16 = vmull_u8(d_argb8.val[2], s_argb8.val[3]); - uint16x8_t d_g16 = vmull_u8(d_argb8.val[1], 
s_argb8.val[3]); - uint16x8_t d_b16 = vmull_u8(d_argb8.val[0], s_argb8.val[3]); - - // 8-bit to do saturated add - d_argb8.val[2] = vqadd_u8(vshrn_n_u16(d_r16, 8), s_argb8.val[2]); - d_argb8.val[1] = vqadd_u8(vshrn_n_u16(d_g16, 8), s_argb8.val[1]); - d_argb8.val[0] = vqadd_u8(vshrn_n_u16(d_b16, 8), s_argb8.val[0]); -#else - // TVPBlendARGB - uint16x8_t d_a16 = vmulq_n_u16(vsubl_u8(s_argb8.val[3], d_argb8.val[3]), opa); - uint16x8_t d_r16 = vmulq_n_u16(vsubl_u8(s_argb8.val[2], d_argb8.val[2]), opa); - uint16x8_t d_g16 = vmulq_n_u16(vsubl_u8(s_argb8.val[1], d_argb8.val[1]), opa); - uint16x8_t d_b16 = vmulq_n_u16(vsubl_u8(s_argb8.val[0], d_argb8.val[0]), opa); - d_argb8.val[3] = vadd_u8(d_argb8.val[3], vshrn_n_u16(d_a16, 8)); - d_argb8.val[2] = vadd_u8(d_argb8.val[2], vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vadd_u8(d_argb8.val[1], vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vadd_u8(d_argb8.val[0], vshrn_n_u16(d_b16, 8)); -#endif - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); -#endif - -#ifdef STRECH_FUNC - srcstart = start; -#endif - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, pEndDst - dest -#ifdef LINEAR_TRANS_FUNC - , src, sx, sy, stepx, stepy, srcpitch -#else //STRECH_FUNC - , src1, src2, _blend_y, srcstart, srcstep -#endif // LINEAR_TRANS_FUNC -#ifdef BLEND_WITH_OPACITY - , _opa -#endif - ); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/MulBlend.h b/src/core/visual/ARM/MulBlend.h deleted file mode 100644 index e524ea34..00000000 --- a/src/core/visual/ARM/MulBlend.h +++ /dev/null @@ -1,62 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len -#ifdef BLEND_WITH_OPACITY - , tjs_int opa -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, src, PreFragLen -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - dest += 
PreFragLen; - src += PreFragLen; - } - } - -#ifdef BLEND_WITH_OPACITY - uint8x8_t opa8 = vdup_n_u8(opa); -#endif - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8((unsigned char*)src); -#ifdef BLEND_WITH_OPACITY - s_argb8.val[2] = vmvn_u8(s_argb8.val[2]); - s_argb8.val[1] = vmvn_u8(s_argb8.val[1]); - s_argb8.val[0] = vmvn_u8(s_argb8.val[0]); - uint16x8_t s_r16 = vmull_u8(s_argb8.val[2], opa8); - uint16x8_t s_g16 = vmull_u8(s_argb8.val[1], opa8); - uint16x8_t s_b16 = vmull_u8(s_argb8.val[0], opa8); - s_argb8.val[2] = vmvn_u8(vshrn_n_u16(s_r16, 8)); - s_argb8.val[1] = vmvn_u8(vshrn_n_u16(s_g16, 8)); - s_argb8.val[0] = vmvn_u8(vshrn_n_u16(s_b16, 8)); -#endif - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - uint16x8_t d_r16 = vmull_u8(s_argb8.val[2], d_argb8.val[2]); - uint16x8_t d_g16 = vmull_u8(s_argb8.val[1], d_argb8.val[1]); - uint16x8_t d_b16 = vmull_u8(s_argb8.val[0], d_argb8.val[0]); - d_argb8.val[2] = vshrn_n_u16(d_r16, 8); - d_argb8.val[1] = vshrn_n_u16(d_g16, 8); - d_argb8.val[0] = vshrn_n_u16(d_b16, 8); -#ifdef NON_HDA - d_argb8.val[3] = vdup_n_u8(0); -#endif - - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, src, pEndDst - dest -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/TLG6_do_chroma.h b/src/core/visual/ARM/TLG6_do_chroma.h deleted file mode 100644 index b9838643..00000000 --- a/src/core/visual/ARM/TLG6_do_chroma.h +++ /dev/null @@ -1,28 +0,0 @@ -case (N<<1): - do { - tjs_uint32 clr = *in; - clr = FILTER; - uint8x8_t v = vreinterpret_u8_u32(vdup_n_u32(clr)); - uint32x2_t u = vdup_n_u32(*prevline++); -#ifdef DEBUG_ARM_NEON - uint8x8_t m = med_NEON(p, u, up); -#else - med_NEON(p, u, up); -#endif - p = vreinterpret_u32_u8(vadd_u8(m, v)); - *curline ++ = vget_lane_u32(p, 0); - up = u; - in 
+= step; - } while(--w); - break; -case (N<<1)+1: - do { - tjs_uint32 clr = *in; - clr = FILTER; - uint8x8_t v = vreinterpret_u8_u32(vdup_n_u32(clr)); - up = vdup_n_u32(*prevline++); - p = vreinterpret_u32_u8(vadd_u8(vrhadd_u8(vreinterpret_u8_u32(p), vreinterpret_u8_u32(up)), v)); - *curline ++ = vget_lane_u32(p, 0); - in += step; - } while(--w); - break; \ No newline at end of file diff --git a/src/core/visual/ARM/addalphablend.h b/src/core/visual/ARM/addalphablend.h deleted file mode 100644 index 269ee2b7..00000000 --- a/src/core/visual/ARM/addalphablend.h +++ /dev/null @@ -1,122 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, -#ifdef STRECH_FUNC - tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep -#elif defined(LINEAR_TRANS_FUNC) - tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch -#else - const tjs_uint32 *src, tjs_int len -#endif -#ifdef BLEND_WITH_OPACITY - , tjs_int opa -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, -#ifdef STRECH_FUNC - PreFragLen, src, srcstart, srcstep -#elif defined(LINEAR_TRANS_FUNC) - PreFragLen, src, sx, sy, stepx, stepy, srcpitch -#else - src, PreFragLen -#endif -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - dest += PreFragLen; -#ifdef STRECH_FUNC - srcstart += PreFragLen * srcstep; -#elif defined(LINEAR_TRANS_FUNC) - sx += stepx * PreFragLen; - sy += stepy * PreFragLen; -#else - src += PreFragLen; -#endif - } - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; -#if defined(STRECH_FUNC) || defined(LINEAR_TRANS_FUNC) - unsigned char strechbuff[32 + 16]; - tjs_uint32 *strechsrc = (tjs_uint32*)((((intptr_t)strechbuff) + 15) & ~15); -#endif -#ifdef BLEND_WITH_OPACITY - uint8x8_t opa8 = vdup_n_u8(opa); -#endif - while(dest < pVecEndDst) { -#if defined(STRECH_FUNC) || 
defined(LINEAR_TRANS_FUNC) - for(int i = 0; i < 8; ++i) { -#ifdef STRECH_FUNC - strechsrc[i] = src[(srcstart + srcstep * i) >> 16]; -#elif defined(LINEAR_TRANS_FUNC) - strechsrc[i] = *((const tjs_uint32*)((const tjs_uint8*)src + ((sy + stepy * i) >> 16)*srcpitch) + ((sx + stepx * i) >> 16)); -#endif - } - uint8x8x4_t s_argb8 = vld4_u8((unsigned char *)strechsrc); -#else - uint8x8x4_t s_argb8 = vld4_u8((unsigned char *)src); -#endif - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -#ifdef BLEND_WITH_OPACITY - { - uint16x8_t s_a16 = vmull_u8(s_argb8.val[3], opa8); - uint16x8_t s_r16 = vmull_u8(s_argb8.val[2], opa8); - uint16x8_t s_g16 = vmull_u8(s_argb8.val[1], opa8); - uint16x8_t s_b16 = vmull_u8(s_argb8.val[0], opa8); - s_argb8.val[3] = vshrn_n_u16(s_a16, 8); - s_argb8.val[2] = vshrn_n_u16(s_r16, 8); - s_argb8.val[1] = vshrn_n_u16(s_g16, 8); - s_argb8.val[0] = vshrn_n_u16(s_b16, 8); - } -#endif -#ifdef BLEND_WITH_ADDALPHA - { - //Da = Sa + Da - SaDa - uint16x8_t d_a16 = vmull_u8(s_argb8.val[3], d_argb8.val[3]); - uint16x8_t t = vaddl_u8(s_argb8.val[3], d_argb8.val[3]); - s_argb8.val[3] = vmvn_u8(s_argb8.val[3]); // 1 - a - d_a16 = vsubq_u16(t, vshrq_n_u16(d_a16, 8)); - d_argb8.val[3] = vmovn_u16(vsubq_u16(d_a16, vshrq_n_u16(d_a16, 8))); - } -#else - s_argb8.val[3] = vmvn_u8(s_argb8.val[3]); // 1 - a -#endif - // s + d * (1 - sa) - uint16x8_t d_r16 = vmull_u8(d_argb8.val[2], s_argb8.val[3]); - uint16x8_t d_g16 = vmull_u8(d_argb8.val[1], s_argb8.val[3]); - uint16x8_t d_b16 = vmull_u8(d_argb8.val[0], s_argb8.val[3]); - - // 8-bit to do saturated add - d_argb8.val[2] = vqadd_u8(vshrn_n_u16(d_r16, 8), s_argb8.val[2]); - d_argb8.val[1] = vqadd_u8(vshrn_n_u16(d_g16, 8), s_argb8.val[1]); - d_argb8.val[0] = vqadd_u8(vshrn_n_u16(d_b16, 8), s_argb8.val[0]); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - -#ifdef STRECH_FUNC - srcstart += srcstep * 8; -#elif defined(LINEAR_TRANS_FUNC) - sx += stepx * 8; - sy += stepy * 8; 
-#else - src += 8; -#endif - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, -#ifdef STRECH_FUNC - pEndDst - dest, src, srcstart, srcstep -#elif defined(LINEAR_TRANS_FUNC) - pEndDst - dest, src, sx, sy, stepx, stepy, srcpitch -#else - src, pEndDst - dest -#endif -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/alphablend.h b/src/core/visual/ARM/alphablend.h deleted file mode 100644 index 39b1084a..00000000 --- a/src/core/visual/ARM/alphablend.h +++ /dev/null @@ -1,175 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, -#ifdef STRECH_FUNC - tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep -#elif defined(LINEAR_TRANS_FUNC) - tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch -#else - const tjs_uint32 *src, tjs_int len -#endif -#ifdef BLEND_WITH_OPACITY - , tjs_int opa -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, -#ifdef STRECH_FUNC - PreFragLen, src, srcstart, srcstep -#elif defined(LINEAR_TRANS_FUNC) - PreFragLen, src, sx, sy, stepx, stepy, srcpitch -#else - src, PreFragLen -#endif -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - dest += PreFragLen; -#ifdef STRECH_FUNC - srcstart += PreFragLen * srcstep; -#elif defined(LINEAR_TRANS_FUNC) - sx += stepx * PreFragLen; - sy += stepy * PreFragLen; -#else - src += PreFragLen; -#endif - } - } - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; -#if defined(STRECH_FUNC) || defined(LINEAR_TRANS_FUNC) - unsigned char strechbuff[32 + 16]; - tjs_uint32 *strechsrc = __builtin_assume_aligned((tjs_uint32*)((((intptr_t)strechbuff) + 15) & ~15), 16); -#endif -#ifdef BLEND_WITH_OPACITY - uint8x8_t opa8 = vdup_n_u8(opa); -#endif -#ifdef BLEND_WITH_DEST_ALPHA - unsigned char tmpbuff[32 + 
16]; - unsigned short *tmpsa = __builtin_assume_aligned((unsigned short*)((((intptr_t)tmpbuff) + 15) & ~15), 16); - unsigned char *tmpa = __builtin_assume_aligned((unsigned char*)(tmpsa + 8), 16); -#ifdef BLEND_WITH_OPACITY - uint16x8_t soparev = vdupq_n_u16(0x00ff); -#endif -#endif - while(dest < pVecEndDst) { -#if defined(STRECH_FUNC) || defined(LINEAR_TRANS_FUNC) - for(int i = 0; i < 8; ++i) { -#ifdef STRECH_FUNC - strechsrc[i] = src[(srcstart) >> 16]; - srcstart += srcstep; -#elif defined(LINEAR_TRANS_FUNC) - strechsrc[i] = *( (const tjs_uint32*)((const tjs_uint8*)src + ((sy)>>16)*srcpitch) + ((sx)>>16)); - sx += stepx; - sy += stepy; -#endif - } - uint8x8x4_t s_argb8 = vld4_u8((unsigned char *)strechsrc); -#else - //__builtin_prefetch(src, 0, 0); - uint8x8x4_t s_argb8 = vld4_u8((unsigned char *)src); -#endif - //__builtin_prefetch(dest, 0, 0); - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -#ifdef BLEND_WITH_OPACITY - uint16x8_t s_a16 = vmull_u8(s_argb8.val[3], opa8); - s_a16 = vshrq_n_u16(s_a16, 8); -#else - uint16x8_t s_a16 = vmovl_u8(s_argb8.val[3]); -#endif -#ifdef BLEND_WITH_ADDALPHA - { -#ifdef BLEND_WITH_OPACITY - s_argb8.val[3] = vmovn_u16(s_a16); -#endif - uint16x8_t d_a16 = vmull_u8(s_argb8.val[3], d_argb8.val[3]); - uint16x8_t s_r16 = vmull_u8(s_argb8.val[2], s_argb8.val[3]); - uint16x8_t s_g16 = vmull_u8(s_argb8.val[1], s_argb8.val[3]); - uint16x8_t s_b16 = vmull_u8(s_argb8.val[0], s_argb8.val[3]); - //Da = Sa + Da - SaDa - d_a16 = vsubq_u16(vaddl_u8(s_argb8.val[3], d_argb8.val[3]), vshrq_n_u16(d_a16, 8)); - d_argb8.val[3] = vmovn_u16(vsubq_u16(d_a16, vshrq_n_u16(d_a16, 8))); - s_argb8.val[2] = vshrn_n_u16(s_r16, 8); - s_argb8.val[1] = vshrn_n_u16(s_g16, 8); - s_argb8.val[0] = vshrn_n_u16(s_b16, 8); - } - - // Di = sat(Si, (1-Sa)*Di) - s_argb8.val[3] = vmvn_u8(s_argb8.val[3]); - - uint16x8_t d_r16 = vmull_u8(d_argb8.val[2], s_argb8.val[3]); - uint16x8_t d_g16 = vmull_u8(d_argb8.val[1], s_argb8.val[3]); - uint16x8_t 
d_b16 = vmull_u8(d_argb8.val[0], s_argb8.val[3]); - - // 8-bit to do saturated add - d_argb8.val[2] = vqadd_u8(s_argb8.val[2], vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vqadd_u8(s_argb8.val[1], vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vqadd_u8(s_argb8.val[0], vshrn_n_u16(d_b16, 8)); -#else // BLEND_WITH_ADDALPHA -#ifdef BLEND_WITH_DEST_ALPHA - //( 255 - (255-a)*(255-b)/ 255 ); -#ifdef BLEND_WITH_OPACITY - uint16x8_t isd_a16 = vmulq_u16(veorq_u16(s_a16, soparev), vmovl_u8(vmvn_u8(d_argb8.val[3]))); -#else - uint16x8_t isd_a16 = vmull_u8(vmvn_u8(s_argb8.val[3]), vmvn_u8(d_argb8.val[3])); -#endif - -#ifdef BLEND_WITH_OPACITY - uint16x8_t sopa = vorrq_u16(vshlq_n_u16(s_a16, 8), vmovl_u8(d_argb8.val[3])); -#else - uint16x8_t sopa = vorrq_u16(vshll_n_u8(s_argb8.val[3], 8), vmovl_u8(d_argb8.val[3])); -#endif - d_argb8.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); - vst1q_u16(tmpsa, sopa); - tmpa[0] = TVPOpacityOnOpacityTable[tmpsa[0]]; - tmpa[1] = TVPOpacityOnOpacityTable[tmpsa[1]]; - tmpa[2] = TVPOpacityOnOpacityTable[tmpsa[2]]; - tmpa[3] = TVPOpacityOnOpacityTable[tmpsa[3]]; - tmpa[4] = TVPOpacityOnOpacityTable[tmpsa[4]]; - tmpa[5] = TVPOpacityOnOpacityTable[tmpsa[5]]; - tmpa[6] = TVPOpacityOnOpacityTable[tmpsa[6]]; - tmpa[7] = TVPOpacityOnOpacityTable[tmpsa[7]]; - s_a16 = vmovl_u8(vld1_u8(tmpa)); -#endif - - // d + (s - d) * sa - uint16x8_t d_r16 = vsubl_u8(s_argb8.val[2], d_argb8.val[2]); - uint16x8_t d_g16 = vsubl_u8(s_argb8.val[1], d_argb8.val[1]); - uint16x8_t d_b16 = vsubl_u8(s_argb8.val[0], d_argb8.val[0]); - - d_r16 = vmulq_u16(d_r16, s_a16); - d_g16 = vmulq_u16(d_g16, s_a16); - d_b16 = vmulq_u16(d_b16, s_a16); - - d_argb8.val[2] = vadd_u8(d_argb8.val[2], vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vadd_u8(d_argb8.val[1], vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vadd_u8(d_argb8.val[0], vshrn_n_u16(d_b16, 8)); -#endif //BLEND_WITH_ADDALPHA - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - -#ifdef STRECH_FUNC -#elif defined(LINEAR_TRANS_FUNC) 
-#else - src += 8; -#endif - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, -#ifdef STRECH_FUNC - pEndDst - dest, src, srcstart, srcstep -#elif defined(LINEAR_TRANS_FUNC) - pEndDst - dest, src, sx, sy, stepx, stepy, srcpitch -#else - src, pEndDst - dest -#endif -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/alphablend2.h b/src/core/visual/ARM/alphablend2.h deleted file mode 100644 index 4f3724dd..00000000 --- a/src/core/visual/ARM/alphablend2.h +++ /dev/null @@ -1,147 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, -#if defined(UNIV_TRANS) - const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len -#ifdef UNIV_TRANS_SWITCH - , tjs_int src1lv, tjs_int src2lv -#endif -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, src1, src2, -#if defined(UNIV_TRANS) - rule, table, PreFragLen -#ifdef UNIV_TRANS_SWITCH - , src1lv, src2lv -#endif -#endif - ); - dest += PreFragLen; - src1 += PreFragLen; - src2 += PreFragLen; -#ifdef UNIV_TRANS - rule += PreFragLen; -#endif - } - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; -#ifdef BLEND_WITH_DEST_ALPHA - unsigned char tmpbuff[32 + 16]; - unsigned short *tmpa = __builtin_assume_aligned((unsigned short*)((((intptr_t)tmpbuff) + 15) & ~15), 16); - unsigned char *tmpd = __builtin_assume_aligned((unsigned char *)(tmpa + 8), 16); - uint16x8_t const256 = vdupq_n_u16(256); -#endif -#ifdef UNIV_TRANS -#ifdef UNIV_TRANS_SWITCH - unsigned char opabuff[16 + 16]; - unsigned short *tmpo = __builtin_assume_aligned((unsigned short*)((((intptr_t)opabuff) + 15) & ~15), 16); -#endif -#endif - while(dest < pVecEndDst) { -#ifdef UNIV_TRANS - uint16x8_t o16; -#ifdef UNIV_TRANS_SWITCH - for (int i = 0; i < 8; ++i) { - tjs_int opa = 
*rule++; - if(opa >= src1lv) { - tmpo[i] = 0; - } else if(opa < src2lv) { - tmpo[i] = 255; - } else { - tmpo[i] = table[opa]; - } - o16 = vld1q_u16(tmpo); - } -#else - o16 = vsetq_lane_u16(table[*rule++], o16, 0); - o16 = vsetq_lane_u16(table[*rule++], o16, 1); - o16 = vsetq_lane_u16(table[*rule++], o16, 2); - o16 = vsetq_lane_u16(table[*rule++], o16, 3); - o16 = vsetq_lane_u16(table[*rule++], o16, 4); - o16 = vsetq_lane_u16(table[*rule++], o16, 5); - o16 = vsetq_lane_u16(table[*rule++], o16, 6); - o16 = vsetq_lane_u16(table[*rule++], o16, 7); -#endif -#endif - uint8x8x4_t s1 = vld4_u8((unsigned char *)src1); -#ifdef BLEND_WITH_DEST_ALPHA - uint16x8_t s1_a16 = vmulq_u16(vmovl_u8(s1.val[3]), vsubq_u16(const256, o16)); // a1*(256-opa) -#endif - uint8x8x4_t s2 = vld4_u8((unsigned char *)src2); -#ifdef BLEND_WITH_DEST_ALPHA - uint16x8_t d_a16 = vmulq_u16(vsubl_u8(s2.val[3], s1.val[3]), o16); - o16 = vmulq_u16(vmovl_u8(s2.val[3]), o16); - o16 = vsriq_n_u16(o16, s1_a16, 8); // addr - s2.val[3] = vadd_u8(s1.val[3], vshrn_n_u16(d_a16, 8)); - vst1q_u16(tmpa, o16); -#ifdef UNIV_TRANS_SWITCH - tmpd[0] = TVPNegativeMulTable[tmpa[0]]; - tmpd[1] = TVPNegativeMulTable[tmpa[1]]; - tmpd[2] = TVPNegativeMulTable[tmpa[2]]; - tmpd[3] = TVPNegativeMulTable[tmpa[3]]; - tmpd[4] = TVPNegativeMulTable[tmpa[4]]; - tmpd[5] = TVPNegativeMulTable[tmpa[5]]; - tmpd[6] = TVPNegativeMulTable[tmpa[6]]; - tmpd[7] = TVPNegativeMulTable[tmpa[7]]; -#endif -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[0]], o16, 0); -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[1]], o16, 1); -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[2]], o16, 2); -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[3]], o16, 3); -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[4]], o16, 4); -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[5]], o16, 5); -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[6]], o16, 6); -// o16 = vsetq_lane_u16(TVPOpacityOnOpacityTable[tmpa[7]], o16, 7); - - 
tmpa[0] = TVPOpacityOnOpacityTable[tmpa[0]]; - tmpa[1] = TVPOpacityOnOpacityTable[tmpa[1]]; - tmpa[2] = TVPOpacityOnOpacityTable[tmpa[2]]; - tmpa[3] = TVPOpacityOnOpacityTable[tmpa[3]]; - tmpa[4] = TVPOpacityOnOpacityTable[tmpa[4]]; - tmpa[5] = TVPOpacityOnOpacityTable[tmpa[5]]; - tmpa[6] = TVPOpacityOnOpacityTable[tmpa[6]]; - tmpa[7] = TVPOpacityOnOpacityTable[tmpa[7]]; - o16 = vld1q_u16(tmpa); -#endif - // s1 * o + s2 * (1 - o) => s1 + (s2 - s1) * o -#ifdef BLEND_WITH_ADDALPHA - uint16x8_t d_a16 = vmulq_u16(vsubl_u8(s2.val[3], s1.val[3]), o16); -#endif - uint16x8_t d_r16 = vmulq_u16(vsubl_u8(s2.val[2], s1.val[2]), o16); - uint16x8_t d_g16 = vmulq_u16(vsubl_u8(s2.val[1], s1.val[1]), o16); - uint16x8_t d_b16 = vmulq_u16(vsubl_u8(s2.val[0], s1.val[0]), o16); - - // s * o >> 8 -#ifdef BLEND_WITH_ADDALPHA - s2.val[3] = vadd_u8(s1.val[3], vshrn_n_u16(d_a16, 8)); -#else - //s2.val[3] = vdup_n_u8(0); -#endif -#ifdef BLEND_WITH_DEST_ALPHA -#ifdef UNIV_TRANS_SWITCH - s2.val[3] = vld1_u8(tmpd); -#endif -#endif - s2.val[2] = vadd_u8(s1.val[2], vshrn_n_u16(d_r16, 8)); - s2.val[1] = vadd_u8(s1.val[1], vshrn_n_u16(d_g16, 8)); - s2.val[0] = vadd_u8(s1.val[0], vshrn_n_u16(d_b16, 8)); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s2); - - src1 += 8; - src2 += 8; - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, src1, src2, -#if defined(UNIV_TRANS) - rule, table, pEndDst - dest -#ifdef UNIV_TRANS_SWITCH - , src1lv, src2lv -#endif -#endif - ); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/ps_addblend.h b/src/core/visual/ARM/ps_addblend.h deleted file mode 100644 index fe12ebdb..00000000 --- a/src/core/visual/ARM/ps_addblend.h +++ /dev/null @@ -1,7 +0,0 @@ -// s = sat(s, d) -{ - s.val[2] = vqadd_u8(s.val[2], d.val[2]); - s.val[1] = vqadd_u8(s.val[1], d.val[1]); - s.val[0] = vqadd_u8(s.val[0], d.val[0]); -} -#include "ps_alphablend.h" \ No newline at end of file diff --git a/src/core/visual/ARM/ps_alphablend.h 
b/src/core/visual/ARM/ps_alphablend.h deleted file mode 100644 index cad841d5..00000000 --- a/src/core/visual/ARM/ps_alphablend.h +++ /dev/null @@ -1,13 +0,0 @@ -// d + (s - d) * sa -{ - uint16x8_t d_r16 = vsubl_u8(s.val[2], d.val[2]); - uint16x8_t d_g16 = vsubl_u8(s.val[1], d.val[1]); - uint16x8_t d_b16 = vsubl_u8(s.val[0], d.val[0]); - d_r16 = vmulq_u16(d_r16, a); - d_g16 = vmulq_u16(d_g16, a); - d_b16 = vmulq_u16(d_b16, a); - - s.val[2] = vadd_u8(d.val[2], vshrn_n_u16(d_r16, 8)); - s.val[1] = vadd_u8(d.val[1], vshrn_n_u16(d_g16, 8)); - s.val[0] = vadd_u8(d.val[0], vshrn_n_u16(d_b16, 8)); -} \ No newline at end of file diff --git a/src/core/visual/ARM/ps_darkenblend.h b/src/core/visual/ARM/ps_darkenblend.h deleted file mode 100644 index d896d359..00000000 --- a/src/core/visual/ARM/ps_darkenblend.h +++ /dev/null @@ -1,6 +0,0 @@ -{ - s.val[2] = vmin_u8(s.val[2], d.val[2]); - s.val[1] = vmin_u8(s.val[1], d.val[1]); - s.val[0] = vmin_u8(s.val[0], d.val[0]); -} -#include "ps_alphablend.h" \ No newline at end of file diff --git a/src/core/visual/ARM/ps_diff5blend.h b/src/core/visual/ARM/ps_diff5blend.h deleted file mode 100644 index a11d6403..00000000 --- a/src/core/visual/ARM/ps_diff5blend.h +++ /dev/null @@ -1,6 +0,0 @@ -#include "ps_fadesrc.h" -{ - s.val[2] = vabd_u8(s.val[2], d.val[2]); - s.val[1] = vabd_u8(s.val[1], d.val[1]); - s.val[0] = vabd_u8(s.val[0], d.val[0]); -} diff --git a/src/core/visual/ARM/ps_diffblend.h b/src/core/visual/ARM/ps_diffblend.h deleted file mode 100644 index 9d6e3534..00000000 --- a/src/core/visual/ARM/ps_diffblend.h +++ /dev/null @@ -1,6 +0,0 @@ -{ - s.val[2] = vabd_u8(s.val[2], d.val[2]); - s.val[1] = vabd_u8(s.val[1], d.val[1]); - s.val[0] = vabd_u8(s.val[0], d.val[0]); -} -#include "ps_alphablend.h" \ No newline at end of file diff --git a/src/core/visual/ARM/ps_exclusionblend.h b/src/core/visual/ARM/ps_exclusionblend.h deleted file mode 100644 index 20757729..00000000 --- a/src/core/visual/ARM/ps_exclusionblend.h +++ /dev/null @@ 
-1,15 +0,0 @@ -// c = ((s+d-(s*d*2)/255)-d)*a + d = (s-(s*d*2)/255)*a + d -{ - uint16x8_t d_r16 = vmull_u8(s.val[2], d.val[2]); - uint16x8_t d_g16 = vmull_u8(s.val[1], d.val[1]); - uint16x8_t d_b16 = vmull_u8(s.val[0], d.val[0]); - d_r16 = vsubq_u16(vmovl_u8(s.val[2]), vshrq_n_u16(d_r16, 7)); - d_g16 = vsubq_u16(vmovl_u8(s.val[1]), vshrq_n_u16(d_g16, 7)); - d_b16 = vsubq_u16(vmovl_u8(s.val[0]), vshrq_n_u16(d_b16, 7)); - d_r16 = vmulq_u16(d_r16, a); - d_g16 = vmulq_u16(d_g16, a); - d_b16 = vmulq_u16(d_b16, a); - s.val[2] = vadd_u8(d.val[2], vshrn_n_u16(d_r16, 8)); - s.val[1] = vadd_u8(d.val[1], vshrn_n_u16(d_g16, 8)); - s.val[0] = vadd_u8(d.val[0], vshrn_n_u16(d_b16, 8)); -} diff --git a/src/core/visual/ARM/ps_fadesrc.h b/src/core/visual/ARM/ps_fadesrc.h deleted file mode 100644 index 7fa567da..00000000 --- a/src/core/visual/ARM/ps_fadesrc.h +++ /dev/null @@ -1,10 +0,0 @@ -// s = s * a -{ - uint16x8_t s_r16 = vmulq_u16(vmovl_u8(s.val[2]), a); - uint16x8_t s_g16 = vmulq_u16(vmovl_u8(s.val[1]), a); - uint16x8_t s_b16 = vmulq_u16(vmovl_u8(s.val[0]), a); - - s.val[2] = vshrn_n_u16(s_r16, 8); - s.val[1] = vshrn_n_u16(s_g16, 8); - s.val[0] = vshrn_n_u16(s_b16, 8); -} \ No newline at end of file diff --git a/src/core/visual/ARM/ps_hardlightblend.h b/src/core/visual/ARM/ps_hardlightblend.h deleted file mode 100644 index 829a8542..00000000 --- a/src/core/visual/ARM/ps_hardlightblend.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifdef _LOCAL_PROC_OVERLAY -sa = vmull_u8(vorr_u8(d.val[_I], mask1), s.val[_I]); -n = vtst_u8(s.val[_I], mask80); // n = d>=128 -d1 = vand_u8(vand_u8(d.val[_I], n), maskFE), s1 = vand_u8(s.val[_I], n); -sa = vshrq_n_u16(sa, 7); -t = vshll_n_u8(vadd_u8(s1, d1), 1); -t = vsubw_u8(t, n); -t = vsubq_u16(t, sa); -s.val[_I] = vand_u8(vmovn_u16(t), n); -s.val[_I] = vorr_u8(s.val[_I], vand_u8(vmovn_u16(sa), vmvn_u8(n))); -#else -{ -#define _LOCAL_PROC_OVERLAY - uint16x8_t sa, t; uint8x8_t n, s1, d1; -#define _I 0 -#include "ps_hardlightblend.h" -#undef _I -#define _I 1 
-#include "ps_hardlightblend.h" -#undef _I -#define _I 2 -#include "ps_hardlightblend.h" -#undef _I -#undef _LOCAL_PROC_OVERLAY -} -#include "ps_alphablend.h" -#endif diff --git a/src/core/visual/ARM/ps_lightenblend.h b/src/core/visual/ARM/ps_lightenblend.h deleted file mode 100644 index 748a4bfc..00000000 --- a/src/core/visual/ARM/ps_lightenblend.h +++ /dev/null @@ -1,6 +0,0 @@ -{ - s.val[2] = vmax_u8(s.val[2], d.val[2]); - s.val[1] = vmax_u8(s.val[1], d.val[1]); - s.val[0] = vmax_u8(s.val[0], d.val[0]); -} -#include "ps_alphablend.h" \ No newline at end of file diff --git a/src/core/visual/ARM/ps_mulblend.h b/src/core/visual/ARM/ps_mulblend.h deleted file mode 100644 index c7f6aa23..00000000 --- a/src/core/visual/ARM/ps_mulblend.h +++ /dev/null @@ -1,9 +0,0 @@ -{ - uint16x8_t d_r16 = vmull_u8(s.val[2], d.val[2]); - uint16x8_t d_g16 = vmull_u8(s.val[1], d.val[1]); - uint16x8_t d_b16 = vmull_u8(s.val[0], d.val[0]); - s.val[2] = vshrn_n_u16(d_r16, 8); - s.val[1] = vshrn_n_u16(d_g16, 8); - s.val[0] = vshrn_n_u16(d_b16, 8); -} -#include "ps_alphablend.h" \ No newline at end of file diff --git a/src/core/visual/ARM/ps_overlayblend.h b/src/core/visual/ARM/ps_overlayblend.h deleted file mode 100644 index 523b230c..00000000 --- a/src/core/visual/ARM/ps_overlayblend.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifdef _LOCAL_PROC_OVERLAY -sa = vmull_u8(vorr_u8(s.val[_I], mask1), d.val[_I]); -n = vtst_u8(d.val[_I], mask80); // n = d>=128 -d1 = vand_u8(d.val[_I], n), s1 = vand_u8(vand_u8(s.val[_I], n), maskFE); -sa = vshrq_n_u16(sa, 7); -t = vshll_n_u8(vadd_u8(s1, d1), 1); -t = vsubw_u8(t, n); -t = vsubq_u16(t, sa); -s.val[_I] = vand_u8(vmovn_u16(t), n); -s.val[_I] = vorr_u8(s.val[_I], vand_u8(vmovn_u16(sa), vmvn_u8(n))); -#else -{ -#define _LOCAL_PROC_OVERLAY - uint16x8_t sa, t; uint8x8_t n, s1, d1; -#define _I 0 -#include "ps_overlayblend.h" -#undef _I -#define _I 1 -#include "ps_overlayblend.h" -#undef _I -#define _I 2 -#include "ps_overlayblend.h" -#undef _I -#undef 
_LOCAL_PROC_OVERLAY -} -#include "ps_alphablend.h" -#endif diff --git a/src/core/visual/ARM/ps_screenblend.h b/src/core/visual/ARM/ps_screenblend.h deleted file mode 100644 index 49be3823..00000000 --- a/src/core/visual/ARM/ps_screenblend.h +++ /dev/null @@ -1,15 +0,0 @@ -// c = ((s+d-(s*d)/255)-d)*a + d = (s-(s*d)/255)*a + d -{ - uint16x8_t d_r16 = vmull_u8(s.val[2], d.val[2]); - uint16x8_t d_g16 = vmull_u8(s.val[1], d.val[1]); - uint16x8_t d_b16 = vmull_u8(s.val[0], d.val[0]); - d_r16 = vsubq_u16(vmovl_u8(s.val[2]), vshrq_n_u16(d_r16, 8)); - d_g16 = vsubq_u16(vmovl_u8(s.val[1]), vshrq_n_u16(d_g16, 8)); - d_b16 = vsubq_u16(vmovl_u8(s.val[0]), vshrq_n_u16(d_b16, 8)); - d_r16 = vmulq_u16(d_r16, a); - d_g16 = vmulq_u16(d_g16, a); - d_b16 = vmulq_u16(d_b16, a); - s.val[2] = vadd_u8(d.val[2], vshrn_n_u16(d_r16, 8)); - s.val[1] = vadd_u8(d.val[1], vshrn_n_u16(d_g16, 8)); - s.val[0] = vadd_u8(d.val[0], vshrn_n_u16(d_b16, 8)); -} diff --git a/src/core/visual/ARM/ps_subblend.h b/src/core/visual/ARM/ps_subblend.h deleted file mode 100644 index f7d95009..00000000 --- a/src/core/visual/ARM/ps_subblend.h +++ /dev/null @@ -1,7 +0,0 @@ -// s <- d - (1 - s) -{ - s.val[2] = vqsub_u8(d.val[2], vmvn_u8(s.val[2])); - s.val[1] = vqsub_u8(d.val[1], vmvn_u8(s.val[1])); - s.val[0] = vqsub_u8(d.val[0], vmvn_u8(s.val[0])); -} -#include "ps_alphablend.h" \ No newline at end of file diff --git a/src/core/visual/ARM/psblend.h b/src/core/visual/ARM/psblend.h deleted file mode 100644 index b7aa794b..00000000 --- a/src/core/visual/ARM/psblend.h +++ /dev/null @@ -1,54 +0,0 @@ -static void _CAT_NAME(FUNC_NAME, _NEON)(tjs_uint32 *dest, - const tjs_uint32 *src, tjs_int len -#ifdef BLEND_WITH_OPACITY - , tjs_int opa -#endif - ) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - C_FUNC_NAME(dest, src, PreFragLen -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - dest += 
PreFragLen; - src += PreFragLen; - } - } - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; -#ifdef BLEND_WITH_OPACITY - uint8x8_t opa8 = vdup_n_u8(opa); -#endif -#ifdef TVPPS_PREPROC - TVPPS_PREPROC -#endif - while(dest < pVecEndDst) { - uint8x8x4_t s = vld4_u8((unsigned char *)src); -#ifdef BLEND_WITH_OPACITY - uint16x8_t a = vmull_u8(s.val[3], opa8); -#else - uint16x8_t a = vmovl_u8(s.val[3]); -#endif - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -#if defined(BLEND_WITH_OPACITY) - a = vshrq_n_u16(a, 8); -#endif -#include TVPPS_OPERATION - s.val[3] = d.val[3]; // hold alpha - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - src += 8; - dest += 8; - } - - if(dest < pEndDst) { - C_FUNC_NAME(dest, src, pEndDst - dest -#ifdef BLEND_WITH_OPACITY - , opa -#endif - ); - } -} \ No newline at end of file diff --git a/src/core/visual/ARM/tvpgl_arm.c b/src/core/visual/ARM/tvpgl_arm.c deleted file mode 100644 index 973ec0ac..00000000 --- a/src/core/visual/ARM/tvpgl_arm.c +++ /dev/null @@ -1,3692 +0,0 @@ -#include "tvpgl_arm_intf.h" -#include "tvpgl_asm_init.h" -#include -#include -#include -#include - -//#define TEST_ARM_NEON_CODE -//#define DEBUG_ARM_NEON -//#define LOG_NEON_TEST - -#ifdef __cplusplus -#if defined(TEST_ARM_NEON_CODE) -#include -#endif -#include -extern "C" { -#endif -extern unsigned char TVPNegativeMulTable[256*256]; -extern unsigned char TVPOpacityOnOpacityTable[256*256]; -extern unsigned short TVPRecipTableForOpacityOnOpacity[256]; -#ifdef __cplusplus -}; -#endif - -#define __CAT_NAME(a, b) a##b -#define _CAT_NAME(a, b) __CAT_NAME(a, b) - -#ifndef Region_AlphaBlend - -#define FUNC_NAME TVPAlphaBlend -#define C_FUNC_NAME TVPAlphaBlend_HDA_c -#include "alphablend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAlphaBlend_o -#define C_FUNC_NAME TVPAlphaBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "alphablend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -// 
static void TVPAlphaBlend_d_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { -// tjs_uint32* pEndDst = dest + len; -// { -// tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; -// if (PreFragLen > len) PreFragLen = len; -// if (PreFragLen) { -// TVPAlphaBlend_d_c(dest, src, PreFragLen); -// dest += PreFragLen; -// src += PreFragLen; -// } -// } -// tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7) - 7; -// unsigned char tmpbuff[32 + 16]; -// unsigned char *tmpa = __builtin_assume_aligned((unsigned char*)((((intptr_t)tmpbuff) + 15) & ~15), 16); -// unsigned short *tmpsa = __builtin_assume_aligned((unsigned short*)(tmpa + 8), 8); -// while (dest < pVecEndDst) { -// uint8x8x4_t s_argb8 = vld4_u8((unsigned char *)src); -// uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); -// -// //( 255 - (255-a)*(255-b)/ 255 ); -// uint16x8_t isd_a16 = vmull_u8(vmvn_u8(s_argb8.val[3]), vmvn_u8(d_argb8.val[3])); -// d_argb8.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); -// -// uint16x8_t sopa = vorrq_u16(vshll_n_u8(s_argb8.val[3], 8), vmovl_u8(d_argb8.val[3])); -// vst1q_u16(tmpsa, sopa); -// tmpa[0] = TVPOpacityOnOpacityTable[tmpsa[0]]; -// tmpa[1] = TVPOpacityOnOpacityTable[tmpsa[1]]; -// tmpa[2] = TVPOpacityOnOpacityTable[tmpsa[2]]; -// tmpa[3] = TVPOpacityOnOpacityTable[tmpsa[3]]; -// tmpa[4] = TVPOpacityOnOpacityTable[tmpsa[4]]; -// tmpa[5] = TVPOpacityOnOpacityTable[tmpsa[5]]; -// tmpa[6] = TVPOpacityOnOpacityTable[tmpsa[6]]; -// tmpa[7] = TVPOpacityOnOpacityTable[tmpsa[7]]; -// uint16x8_t s_a16 = vmovl_u8(vld1_u8(tmpa)); -// -// // d + (s - d) * sa -// uint16x8_t d_r16 = vsubl_u8(s_argb8.val[2], d_argb8.val[2]); -// uint16x8_t d_g16 = vsubl_u8(s_argb8.val[1], d_argb8.val[1]); -// uint16x8_t d_b16 = vsubl_u8(s_argb8.val[0], d_argb8.val[0]); -// -// d_r16 = vmulq_u16(d_r16, s_a16); -// d_g16 = vmulq_u16(d_g16, s_a16); -// d_b16 = vmulq_u16(d_b16, s_a16); -// -// d_argb8.val[2] = vadd_u8(d_argb8.val[2], 
vshrn_n_u16(d_r16, 8)); -// d_argb8.val[1] = vadd_u8(d_argb8.val[1], vshrn_n_u16(d_g16, 8)); -// d_argb8.val[0] = vadd_u8(d_argb8.val[0], vshrn_n_u16(d_b16, 8)); -// -// vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); -// -// src += 8; -// dest += 8; -// } -// -// if (dest < pEndDst) { -// TVPAlphaBlend_d_c(dest, src, pEndDst - dest); -// } -// } - -#define FUNC_NAME TVPAlphaBlend_d -#define C_FUNC_NAME TVPAlphaBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "alphablend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAlphaBlend_a -#define C_FUNC_NAME TVPAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "alphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAlphaBlend_do -#define C_FUNC_NAME TVPAlphaBlend_do_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_DEST_ALPHA -#include "alphablend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAlphaBlend_ao -#define C_FUNC_NAME TVPAlphaBlend_ao_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_ADDALPHA -#include "alphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#endif - -static void TVPAlphaColorMat_NEON(tjs_uint32 *dest, const tjs_uint32 color, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPAlphaColorMat_c(dest, color, PreFragLen); - dest += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - uint8x8_t d_argb8[3]; - d_argb8[0] = vdup_n_u8(color & 0xff); - d_argb8[1] = vdup_n_u8((color >> 8) & 0xff); - d_argb8[2] = vdup_n_u8((color >> 16) & 0xff); - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - uint16x8_t s_a16 = 
vmovl_u8(s_argb8.val[3]); - - // d + (s - d) * sa - uint16x8_t d_r16 = vsubl_u8(s_argb8.val[2], d_argb8[2]); - uint16x8_t d_g16 = vsubl_u8(s_argb8.val[1], d_argb8[1]); - uint16x8_t d_b16 = vsubl_u8(s_argb8.val[0], d_argb8[0]); - d_r16 = vmulq_u16(d_r16, s_a16); - d_g16 = vmulq_u16(d_g16, s_a16); - d_b16 = vmulq_u16(d_b16, s_a16); - - // 8-bit to do saturated add - s_argb8.val[2] = vadd_u8(d_argb8[2], vshrn_n_u16(d_r16, 8)); - s_argb8.val[1] = vadd_u8(d_argb8[1], vshrn_n_u16(d_g16, 8)); - s_argb8.val[0] = vadd_u8(d_argb8[0], vshrn_n_u16(d_b16, 8)); - s_argb8.val[3] = vdup_n_u8(0xFF); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s_argb8); - dest += 8; - } - - if(dest < pEndDst) { - TVPAlphaColorMat_c(dest, color, pEndDst - dest); - } -} - -#ifndef Region_AdditiveAlphaBlend - -#define FUNC_NAME TVPAdditiveAlphaBlend -#define C_FUNC_NAME TVPAdditiveAlphaBlend_HDA_c -#include "addalphablend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAdditiveAlphaBlend_o -#define C_FUNC_NAME TVPAdditiveAlphaBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "addalphablend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAdditiveAlphaBlend_a -#define C_FUNC_NAME TVPAdditiveAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "addalphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAdditiveAlphaBlend_ao -#define C_FUNC_NAME TVPAdditiveAlphaBlend_ao_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_ADDALPHA -#include "addalphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#endif - -static void TVPConvertAlphaToAdditiveAlpha_NEON(tjs_uint32 *dest, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPConvertAlphaToAdditiveAlpha_c(dest, PreFragLen); - dest += PreFragLen; - 
} - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - while(dest < pVecEndDst) { - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - uint16x8_t d_r16 = vmull_u8(d_argb8.val[2], d_argb8.val[3]); - uint16x8_t d_g16 = vmull_u8(d_argb8.val[1], d_argb8.val[3]); - uint16x8_t d_b16 = vmull_u8(d_argb8.val[0], d_argb8.val[3]); - d_argb8.val[2] = vshrn_n_u16(d_r16, 8); - d_argb8.val[1] = vshrn_n_u16(d_g16, 8); - d_argb8.val[0] = vshrn_n_u16(d_b16, 8); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - } - if(dest < pEndDst) { - TVPConvertAlphaToAdditiveAlpha_c(dest, pEndDst - dest); - } -} - -#ifndef Region_StretchAlphaBlend -#define STRECH_FUNC -#define FUNC_NAME TVPStretchAlphaBlend -#define C_FUNC_NAME TVPStretchAlphaBlend_HDA_c -#include "alphablend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAlphaBlend_o -#define C_FUNC_NAME TVPStretchAlphaBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "alphablend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAlphaBlend_d -#define C_FUNC_NAME TVPStretchAlphaBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "alphablend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAlphaBlend_a -#define C_FUNC_NAME TVPStretchAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "alphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAlphaBlend_do -#define C_FUNC_NAME TVPStretchAlphaBlend_do_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_DEST_ALPHA -#include "alphablend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAlphaBlend_ao -#define C_FUNC_NAME TVPStretchAlphaBlend_ao_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_ADDALPHA -#include "alphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef 
BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef STRECH_FUNC -#endif - -#ifndef Region_StretchAddAlphaBlend -#define STRECH_FUNC -#define FUNC_NAME TVPStretchAdditiveAlphaBlend -#define C_FUNC_NAME TVPStretchAdditiveAlphaBlend_HDA_c -#include "addalphablend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPInterpStretchAdditiveAlphaBlend -#define C_FUNC_NAME TVPInterpStretchAdditiveAlphaBlend_c -#define BLEND_WITH_ADDALPHA -#include "InterpTransBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPInterpStretchAdditiveAlphaBlend_o -#define C_FUNC_NAME TVPInterpStretchAdditiveAlphaBlend_o_c -#define BLEND_WITH_OPACITY -#include "InterpTransBlend.h" -#undef BLEND_WITH_OPACITY -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAdditiveAlphaBlend_o -#define C_FUNC_NAME TVPStretchAdditiveAlphaBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "addalphablend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAdditiveAlphaBlend_a -#define C_FUNC_NAME TVPStretchAdditiveAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "addalphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchAdditiveAlphaBlend_ao -#define C_FUNC_NAME TVPStretchAdditiveAlphaBlend_ao_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_ADDALPHA -#include "addalphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#undef STRECH_FUNC -#endif - -#ifndef Region_LinTransAlphaBlend -#define LINEAR_TRANS_FUNC -#define FUNC_NAME TVPLinTransAlphaBlend -#define C_FUNC_NAME TVPLinTransAlphaBlend_HDA_c -#include "alphablend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransAlphaBlend_o -#define C_FUNC_NAME TVPLinTransAlphaBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "alphablend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef 
FUNC_NAME - -#define FUNC_NAME TVPLinTransAlphaBlend_d -#define C_FUNC_NAME TVPLinTransAlphaBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "alphablend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransAlphaBlend_a -#define C_FUNC_NAME TVPLinTransAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "alphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransAlphaBlend_do -#define C_FUNC_NAME TVPLinTransAlphaBlend_do_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_DEST_ALPHA -#include "alphablend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransAlphaBlend_ao -#define C_FUNC_NAME TVPLinTransAlphaBlend_ao_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_ADDALPHA -#include "alphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#undef LINEAR_TRANS_FUNC -#endif - -#ifndef Region_LinTransAddAlphaBlend -#define LINEAR_TRANS_FUNC - -#define FUNC_NAME TVPLinTransAdditiveAlphaBlend -#define C_FUNC_NAME TVPLinTransAdditiveAlphaBlend_HDA_c -#include "addalphablend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransAdditiveAlphaBlend_o -#define C_FUNC_NAME TVPLinTransAdditiveAlphaBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "addalphablend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransAdditiveAlphaBlend_a -#define C_FUNC_NAME TVPLinTransAdditiveAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "addalphablend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransAdditiveAlphaBlend_ao -#define C_FUNC_NAME TVPLinTransAdditiveAlphaBlend_ao_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_ADDALPHA -#include "addalphablend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define 
FUNC_NAME TVPInterpLinTransAdditiveAlphaBlend -#define C_FUNC_NAME TVPInterpLinTransAdditiveAlphaBlend_c -#include "InterpTransBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPInterpLinTransAdditiveAlphaBlend_o -#define C_FUNC_NAME TVPInterpLinTransAdditiveAlphaBlend_o_c -#define BLEND_WITH_OPACITY -#include "InterpTransBlend.h" -#undef BLEND_WITH_OPACITY -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#undef LINEAR_TRANS_FUNC -#endif - -static void TVPCopyOpaqueImage_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPCopyOpaqueImage_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - __builtin_prefetch(src, 0, 0); - uint8x16x4_t s0 = vld4q_u8(__builtin_assume_aligned((uint8_t *)(src), 4)); - s0.val[3] = vdupq_n_u8(0xFF); - vst4q_u8(__builtin_assume_aligned((uint8_t *)(dest), 8), s0); - src += 16; - dest += 16; - -// __builtin_prefetch(src, 0, 0); -// uint8x16x4_t s1 = vld4q_u8(__builtin_assume_aligned((uint8_t *)(src), 4)); -// s1.val[3] = vdupq_n_u8(0xFF); -// vst4q_u8(__builtin_assume_aligned((uint8_t *)(dest + 16), 8), s1); -// src += 16; -// dest += 16; - } - } else { - while (dest < pVecEndDst) { - __builtin_prefetch(src, 0, 0); - uint8x16x4_t s0 = vld4q_u8(__builtin_assume_aligned((uint8_t *)(src), 8)); - s0.val[3] = vdupq_n_u8(0xFF); - vst4q_u8(__builtin_assume_aligned((uint8_t *)(dest), 8), s0); - src += 16; - dest += 16; - -// __builtin_prefetch(src, 0, 0); -// uint8x16x4_t s1 = vld4q_u8(__builtin_assume_aligned((uint8_t *)(src), 8)); -// s1.val[3] = vdupq_n_u8(0xFF); -// vst4q_u8(__builtin_assume_aligned((uint8_t *)(dest + 16), 8), s1); -// src += 16; -// dest += 16; - } - } - - 
if(dest < pEndDst) { - TVPCopyOpaqueImage_c(dest, src, pEndDst - dest); - } -} - -#ifndef Region_ConstAlphaBlend - -#define FUNC_NAME TVPConstAlphaBlend -#define C_FUNC_NAME TVPConstAlphaBlend_HDA_c -#include "ConstAlphaBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPConstAlphaBlend_d -#define C_FUNC_NAME TVPConstAlphaBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "ConstAlphaBlend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPConstAlphaBlend_a -#define C_FUNC_NAME TVPConstAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "ConstAlphaBlend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define STRECH_FUNC -#define FUNC_NAME TVPStretchConstAlphaBlend -#define C_FUNC_NAME TVPStretchConstAlphaBlend_HDA_c -#include "ConstAlphaBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchConstAlphaBlend_d -#define C_FUNC_NAME TVPStretchConstAlphaBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "ConstAlphaBlend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPStretchConstAlphaBlend_a -#define C_FUNC_NAME TVPStretchConstAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "ConstAlphaBlend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef STRECH_FUNC - -#define LINEAR_TRANS_FUNC - -#define FUNC_NAME TVPLinTransConstAlphaBlend -#define C_FUNC_NAME TVPLinTransConstAlphaBlend_HDA_c -#include "ConstAlphaBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransConstAlphaBlend_d -#define C_FUNC_NAME TVPLinTransConstAlphaBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "ConstAlphaBlend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPLinTransConstAlphaBlend_a -#define C_FUNC_NAME TVPLinTransConstAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "ConstAlphaBlend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME 
-#undef FUNC_NAME - -#undef LINEAR_TRANS_FUNC - -#define FUNC_NAME TVPConstAlphaBlend_SD -#define C_FUNC_NAME TVPConstAlphaBlend_SD_c -#include "ConstAlphaBlend2.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPConstAlphaBlend_SD_a -#define C_FUNC_NAME TVPConstAlphaBlend_SD_a_c -#define BLEND_WITH_ADDALPHA -#include "ConstAlphaBlend2.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPConstAlphaBlend_SD_d -#define C_FUNC_NAME TVPConstAlphaBlend_SD_d_c -#define BLEND_WITH_DEST_ALPHA -#include "ConstAlphaBlend2.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#endif - -static void TVPStretchCopyOpaqueImage_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPStretchCopyOpaqueImage_c(dest, PreFragLen, src, srcstart, srcstep); - dest += PreFragLen; - srcstart += PreFragLen * srcstep; - } - } - - unsigned char strechbuff[64 + 16]; - tjs_uint32 *strechsrc = __builtin_assume_aligned((tjs_uint32*)((((intptr_t)strechbuff) + 7) & ~7), 8); - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - while(dest < pVecEndDst) { - for(int i = 0; i < 16; ++i) { - strechsrc[i] = src[(srcstart) >> 16]; - srcstart += srcstep; - } - uint8x16x4_t s = vld4q_u8((uint8_t *)strechsrc); - s.val[3] = vdupq_n_u8(0xFF); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - dest += 16; - } - - if(dest < pEndDst) { - TVPStretchCopyOpaqueImage_c(dest, pEndDst - dest, src, srcstart, srcstep); - } -} - -void TVPLinTransCopyOpaqueImage_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - 
if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPLinTransCopyOpaqueImage_c(dest, PreFragLen, src, sx, sy, stepx, stepy, srcpitch); - dest += PreFragLen; - sx += stepx * PreFragLen; - sy += stepy * PreFragLen; - } - } - - unsigned char strechbuff[64 + 16]; - tjs_uint32 *strechsrc = __builtin_assume_aligned((tjs_uint32*)((((intptr_t)strechbuff) + 7) & ~7), 8); - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - while(dest < pVecEndDst) { - for(int i = 0; i < 16; ++i) { - strechsrc[i] = *( (const tjs_uint32*)((const tjs_uint8*)src + (sy>>16)*srcpitch) + (sx>>16)); - sx += stepx; - sy += stepy; - } - uint8x16x4_t s = vld4q_u8((uint8_t *)strechsrc); - s.val[3] = vdupq_n_u8(0xFF); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - dest += 16; - } - - if(dest < pEndDst) { - TVPLinTransCopyOpaqueImage_c(dest, pEndDst - dest, src, sx, sy, stepx, stepy, srcpitch); - } -} - -#ifndef Region_UnivTransBlend - -#define STRECH_FUNC -#define FUNC_NAME TVPInterpStretchConstAlphaBlend -#define C_FUNC_NAME TVPInterpStretchConstAlphaBlend_c -#define BLEND_WITH_OPACITY -#include "InterpTransBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef STRECH_FUNC - -#define LINEAR_TRANS_FUNC -#define FUNC_NAME TVPInterpLinTransConstAlphaBlend -#define C_FUNC_NAME TVPInterpLinTransConstAlphaBlend_c -#include "InterpTransBlend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef LINEAR_TRANS_FUNC - -#define UNIV_TRANS - -#define FUNC_NAME TVPUnivTransBlend -#define C_FUNC_NAME TVPUnivTransBlend_c -#include "alphablend2.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPUnivTransBlend_d -#define C_FUNC_NAME TVPUnivTransBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "alphablend2.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPUnivTransBlend_a -#define C_FUNC_NAME TVPUnivTransBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "alphablend2.h" -#undef BLEND_WITH_ADDALPHA 
-#undef C_FUNC_NAME -#undef FUNC_NAME - -#define UNIV_TRANS_SWITCH -#define FUNC_NAME TVPUnivTransBlend_switch -#define C_FUNC_NAME TVPUnivTransBlend_switch_c -#include "alphablend2.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPUnivTransBlend_switch_d -#define C_FUNC_NAME TVPUnivTransBlend_switch_d_c -#define BLEND_WITH_DEST_ALPHA -#include "alphablend2.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPUnivTransBlend_switch_a -#define C_FUNC_NAME TVPUnivTransBlend_switch_a_c -#define BLEND_WITH_ADDALPHA -#include "alphablend2.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#undef UNIV_TRANS_SWITCH -#undef UNIV_TRANS -#endif - -#ifndef Region_ApplyColorMap - -#define FUNC_NAME TVPApplyColorMap -#define C_FUNC_NAME TVPApplyColorMap_HDA_c -#include "ApplyColorMap.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPApplyColorMap_o -#define C_FUNC_NAME TVPApplyColorMap_HDA_o_c -#define BLEND_WITH_OPACITY -#include "ApplyColorMap.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPApplyColorMap_d -#define C_FUNC_NAME TVPApplyColorMap_d_c -#define BLEND_WITH_DEST_ALPHA -#include "ApplyColorMap.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPApplyColorMap_a -#define C_FUNC_NAME TVPApplyColorMap_a_c -#define BLEND_WITH_ADDALPHA -#include "ApplyColorMap.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPApplyColorMap_do -#define C_FUNC_NAME TVPApplyColorMap_do_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_DEST_ALPHA -#include "ApplyColorMap.h" -#undef BLEND_WITH_OPACITY -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPApplyColorMap_ao -#define C_FUNC_NAME TVPApplyColorMap_ao_c -#define BLEND_WITH_OPACITY -#define BLEND_WITH_ADDALPHA -#include "ApplyColorMap.h" -#undef BLEND_WITH_OPACITY -#undef 
BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#endif - -#ifndef Region_ConstColorAlphaBlend - -#define FUNC_NAME TVPConstColorAlphaBlend -#define C_FUNC_NAME TVPConstColorAlphaBlend_c -#include "ConstColorAlphaBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPConstColorAlphaBlend_d -#define C_FUNC_NAME TVPConstColorAlphaBlend_d_c -#define BLEND_WITH_DEST_ALPHA -#include "ConstColorAlphaBlend.h" -#undef BLEND_WITH_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPConstColorAlphaBlend_a -#define C_FUNC_NAME TVPConstColorAlphaBlend_a_c -#define BLEND_WITH_ADDALPHA -#include "ConstColorAlphaBlend.h" -#undef BLEND_WITH_ADDALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME - -#endif - -static void TVPRemoveConstOpacity_NEON(tjs_uint32 *dest, tjs_int len, tjs_int strength) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPRemoveConstOpacity_c(dest, PreFragLen, strength); - dest += PreFragLen; - } - } - - uint8x8_t istrength = vdup_n_u8(255 - strength); - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - while(dest < pVecEndDst) { - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[3] = vshrn_n_u16(vmull_u8(d.val[3], istrength), 8); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 8; - } - - if(dest < pEndDst) { - TVPRemoveConstOpacity_c(dest, pEndDst - dest, strength); - } -} - -static void TVPRemoveOpacity_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPRemoveOpacity_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - if (!(((intptr_t)src) & 
7)) { - while (dest < pVecEndDst) { - uint8x8_t s = vld1_u8(__builtin_assume_aligned((uint8_t *)src, 8)); - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[3] = vshrn_n_u16(vmull_u8(d.val[3], vmvn_u8(s)), 8); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 8; - src += 8; - } - } - - if(dest < pEndDst) { - TVPRemoveOpacity_c(dest, src, pEndDst - dest); - } -} - -static void TVPRemoveOpacity_o_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_int _strength) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPRemoveOpacity_o_c(dest, src, PreFragLen, _strength); - dest += PreFragLen; - src += PreFragLen; - } - } - - uint16x8_t strength = vdupq_n_u16(_strength > 127 ? _strength + 1 : _strength);/* adjust for error */ - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - if (!(((intptr_t)src) & 7)) { - while (dest < pVecEndDst) { - //d.rgb | (d.a * (65535 - s * str) >> 8) - uint16x8_t s16 = vmulq_u16(vmovl_u8(vld1_u8(__builtin_assume_aligned(src, 8))), strength); // s * str(8pix) - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned(__builtin_assume_aligned((uint8_t *)dest, 8), 8)); // d (8pix) - s16 = vmvnq_u16(s16); // 65535 - s - s16 = vmull_u8(vshrn_n_u16(s16, 8), d.val[3]); // da * (65535 - s * str) - d.val[3] = vshrn_n_u16(s16, 8); - vst4_u8(__builtin_assume_aligned(__builtin_assume_aligned((uint8_t *)dest, 8), 8), d); - dest += 8; - src += 8; - } - } - - if(dest < pEndDst) { - TVPRemoveOpacity_o_c(dest, src, pEndDst - dest, _strength); - } -} - -#ifndef Region_AddBlend -#define OP_FUNC vqaddq_u8 -#define FUNC_NAME TVPAddBlend -#define C_FUNC_NAME TVPAddBlend_c -#include "AddBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPAddBlend_HDA -#define C_FUNC_NAME TVPAddBlend_HDA_c -#define HOLD_DEST_ALPHA -#include "AddBlend.h" -#undef 
HOLD_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef OP_FUNC - -#define OP_FUNC vqadd_u8 -#define FUNC_NAME TVPAddBlend_o -#define C_FUNC_NAME TVPAddBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "AddBlend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef OP_FUNC - -#endif - -#ifndef Region_SubBlend -#define SUB_FUNC -#define OP_FUNC vqsubq_u8 -#define FUNC_NAME TVPSubBlend -#define C_FUNC_NAME TVPSubBlend_c -#include "AddBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPSubBlend_HDA -#define C_FUNC_NAME TVPSubBlend_HDA_c -#define HOLD_DEST_ALPHA -#include "AddBlend.h" -#undef HOLD_DEST_ALPHA -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef OP_FUNC - -#define OP_FUNC vqsub_u8 -#define FUNC_NAME TVPSubBlend_o -#define C_FUNC_NAME TVPSubBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "AddBlend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef OP_FUNC -#undef SUB_FUNC -#endif - -#ifndef Region_MulBlend -#define FUNC_NAME TVPMulBlend_HDA -#define C_FUNC_NAME TVPMulBlend_HDA_c -#include "MulBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPMulBlend_HDA_o -#define C_FUNC_NAME TVPMulBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "MulBlend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define NON_HDA -#define FUNC_NAME TVPMulBlend -#define C_FUNC_NAME TVPMulBlend_c -#include "MulBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPMulBlend_o -#define C_FUNC_NAME TVPMulBlend_o_c -#define BLEND_WITH_OPACITY -#include "MulBlend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef NON_HDA -#endif - -static void TVPColorDodgeBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPColorDodgeBlend_HDA_c(dest, src, 
PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - unsigned char tmpbuff[16 + 16 + 8]; - unsigned short *tmpa = __builtin_assume_aligned((unsigned short*)((((intptr_t)tmpbuff) + 7) & ~7), 8); - unsigned char* tmpb = __builtin_assume_aligned((unsigned char*)(tmpa + 8), 8); - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7) - 7; - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8((unsigned char*)src); - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - - // d = d * 255 / (255 - s) - s_argb8.val[2] = vmvn_u8(s_argb8.val[2]); - s_argb8.val[1] = vmvn_u8(s_argb8.val[1]); - s_argb8.val[0] = vmvn_u8(s_argb8.val[0]); - - uint16x8_t tmp = vsubl_u8(s_argb8.val[2], d_argb8.val[2]); - uint8x8_t mask = vshrn_n_u16(tmp, 8); // 00 or FF - vst1_u8(tmpb, s_argb8.val[2]); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[0]], tmp, 0); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[1]], tmp, 1); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[2]], tmp, 2); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[3]], tmp, 3); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[4]], tmp, 4); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[5]], tmp, 5); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[6]], tmp, 6); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[7]], tmp, 7); -// for(int i = 0; i < 8; ++i) { -// tmpa[i] = TVPRecipTableForOpacityOnOpacity[tmpb[i]]; -// } -// tmp = vld1q_u16(tmpa); - tmp = vmulq_u16(vmovl_u8(d_argb8.val[2]), tmp); - d_argb8.val[2] = vorr_u8(vshrn_n_u16(tmp, 8), mask); - - tmp = vsubl_u8(s_argb8.val[1], d_argb8.val[1]); - mask = vshrn_n_u16(tmp, 8); - vst1_u8(tmpb, s_argb8.val[1]); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[0]], tmp, 0); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[1]], tmp, 1); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[2]], tmp, 
2); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[3]], tmp, 3); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[4]], tmp, 4); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[5]], tmp, 5); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[6]], tmp, 6); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[7]], tmp, 7); -// for(int i = 0; i < 8; ++i) { -// tmpa[i] = TVPRecipTableForOpacityOnOpacity[tmpb[i]]; -// } -// tmp = vld1q_u16(tmpa); - tmp = vmulq_u16(vmovl_u8(d_argb8.val[1]), tmp); - d_argb8.val[1] = vorr_u8(vshrn_n_u16(tmp, 8), mask); - - tmp = vsubl_u8(s_argb8.val[0], d_argb8.val[0]); - mask = vshrn_n_u16(tmp, 8); - vst1_u8(tmpb, s_argb8.val[0]); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[0]], tmp, 0); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[1]], tmp, 1); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[2]], tmp, 2); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[3]], tmp, 3); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[4]], tmp, 4); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[5]], tmp, 5); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[6]], tmp, 6); - tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[7]], tmp, 7); -// for(int i = 0; i < 8; ++i) { -// tmpa[i] = TVPRecipTableForOpacityOnOpacity[tmpb[i]]; -// } -// tmp = vld1q_u16(tmpa); - tmp = vmulq_u16(vmovl_u8(d_argb8.val[0]), tmp); - d_argb8.val[0] = vorr_u8(vshrn_n_u16(tmp, 8), mask); - - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - TVPColorDodgeBlend_HDA_c(dest, src, pEndDst - dest); - } -} - -static void TVPColorDodgeBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - 
if(PreFragLen) { - TVPColorDodgeBlend_HDA_o_c(dest, src, PreFragLen, opa); - dest += PreFragLen; - src += PreFragLen; - } - } - - unsigned char tmpbuff[16 + 16 + 8]; - unsigned short *tmpa = __builtin_assume_aligned((unsigned short*)((((intptr_t)tmpbuff) + 7) & ~7), 8); - unsigned char* tmpb = __builtin_assume_aligned((unsigned char*)(tmpa + 8), 8); - - uint8x8_t opa8 = vdup_n_u8(opa); - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8((unsigned char*)src); - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - - // d = d * 255 / (255 - s * opa / 256) - uint16x8_t s_r16 = vmull_u8(s_argb8.val[2], opa8); - uint16x8_t s_g16 = vmull_u8(s_argb8.val[1], opa8); - uint16x8_t s_b16 = vmull_u8(s_argb8.val[0], opa8); - - s_argb8.val[2] = vmvn_u8(vshrn_n_u16(s_r16, 8)); - s_argb8.val[1] = vmvn_u8(vshrn_n_u16(s_g16, 8)); - s_argb8.val[0] = vmvn_u8(vshrn_n_u16(s_b16, 8)); - - uint16x8_t tmp = vsubl_u8(s_argb8.val[2], d_argb8.val[2]); - uint8x8_t mask = vshrn_n_u16(tmp, 8); - vst1_u8(tmpb, s_argb8.val[2]); - for(int i = 0; i < 8; ++i) { - tmpa[i] = TVPRecipTableForOpacityOnOpacity[tmpb[i]]; - } - tmp = vld1q_u16(tmpa); - tmp = vmulq_u16(vmovl_u8(d_argb8.val[2]), tmp); - d_argb8.val[2] = vorr_u8(vshrn_n_u16(tmp, 8), mask); - - tmp = vsubl_u8(s_argb8.val[1], d_argb8.val[1]); - mask = vshrn_n_u16(tmp, 8); - vst1_u8(tmpb, s_argb8.val[1]); - for(int i = 0; i < 8; ++i) { - tmpa[i] = TVPRecipTableForOpacityOnOpacity[tmpb[i]]; - } - tmp = vld1q_u16(tmpa); - tmp = vmulq_u16(vmovl_u8(d_argb8.val[1]), tmp); - d_argb8.val[1] = vorr_u8(vshrn_n_u16(tmp, 8), mask); - - tmp = vsubl_u8(s_argb8.val[0], d_argb8.val[0]); - mask = vshrn_n_u16(tmp, 8); - vst1_u8(tmpb, s_argb8.val[0]); - for(int i = 0; i < 8; ++i) { - tmpa[i] = TVPRecipTableForOpacityOnOpacity[tmpb[i]]; - } - tmp = vld1q_u16(tmpa); - tmp = vmulq_u16(vmovl_u8(d_argb8.val[0]), tmp); - d_argb8.val[0] = vorr_u8(vshrn_n_u16(tmp, 8), 
mask); - - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - TVPColorDodgeBlend_HDA_o_c(dest, src, pEndDst - dest, opa); - } -} - -static void TVPDarkenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPDarkenBlend_HDA_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - uint8x16x4_t d = vld4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[0] = vminq_u8(s.val[0], d.val[0]); - d.val[1] = vminq_u8(s.val[1], d.val[1]); - d.val[2] = vminq_u8(s.val[2], d.val[2]); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - src += 16; - dest += 16; - } - } else { - while (dest < pVecEndDst) { - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned((uint8_t *)src, 8)); - uint8x16x4_t d = vld4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[0] = vminq_u8(s.val[0], d.val[0]); - d.val[1] = vminq_u8(s.val[1], d.val[1]); - d.val[2] = vminq_u8(s.val[2], d.val[2]); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - src += 16; - dest += 16; - } - } - - if(dest < pEndDst) { - TVPDarkenBlend_HDA_c(dest, src, pEndDst - dest); - } -} - -static void TVPDarkenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPDarkenBlend_HDA_o_c(dest, src, PreFragLen, opa); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = 
(tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - uint16x8_t opa16 = vdupq_n_u16(opa); - uint8x8_t revopa8 = vdup_n_u8(~opa); - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - - s_argb8.val[2] = vmin_u8(s_argb8.val[2], d_argb8.val[2]); - s_argb8.val[1] = vmin_u8(s_argb8.val[1], d_argb8.val[1]); - s_argb8.val[0] = vmin_u8(s_argb8.val[0], d_argb8.val[0]); - - // d + (s - d) * o - uint16x8_t d_r16 = vmulq_u16(vsubl_u8(s_argb8.val[2], d_argb8.val[2]), opa16); - uint16x8_t d_g16 = vmulq_u16(vsubl_u8(s_argb8.val[1], d_argb8.val[1]), opa16); - uint16x8_t d_b16 = vmulq_u16(vsubl_u8(s_argb8.val[0], d_argb8.val[0]), opa16); - - d_argb8.val[2] = vadd_u8(d_argb8.val[2], vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vadd_u8(d_argb8.val[1], vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vadd_u8(d_argb8.val[0], vshrn_n_u16(d_b16, 8)); - - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - TVPDarkenBlend_HDA_o_c(dest, src, pEndDst - dest, opa); - } -} - -static void TVPLightenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPLightenBlend_HDA_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - uint8x16x4_t d = vld4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[0] = vmaxq_u8(s.val[0], d.val[0]); - d.val[1] = vmaxq_u8(s.val[1], d.val[1]); - d.val[2] = vmaxq_u8(s.val[2], d.val[2]); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - src += 16; - dest += 
16; - } - } else { - while (dest < pVecEndDst) { - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned((uint8_t *)src, 8)); - uint8x16x4_t d = vld4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[0] = vmaxq_u8(s.val[0], d.val[0]); - d.val[1] = vmaxq_u8(s.val[1], d.val[1]); - d.val[2] = vmaxq_u8(s.val[2], d.val[2]); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - src += 16; - dest += 16; - } - } - - if(dest < pEndDst) { - TVPLightenBlend_HDA_c(dest, src, pEndDst - dest); - } -} - -static void TVPLightenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPLightenBlend_HDA_o_c(dest, src, PreFragLen, opa); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - uint16x8_t opa16 = vdupq_n_u16(opa); - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - - s_argb8.val[2] = vmax_u8(s_argb8.val[2], d_argb8.val[2]); - s_argb8.val[1] = vmax_u8(s_argb8.val[1], d_argb8.val[1]); - s_argb8.val[0] = vmax_u8(s_argb8.val[0], d_argb8.val[0]); - - // d + (s - d) * o - uint16x8_t d_r16 = vmulq_u16(vsubl_u8(s_argb8.val[2], d_argb8.val[2]), opa16); - uint16x8_t d_g16 = vmulq_u16(vsubl_u8(s_argb8.val[1], d_argb8.val[1]), opa16); - uint16x8_t d_b16 = vmulq_u16(vsubl_u8(s_argb8.val[0], d_argb8.val[0]), opa16); - - d_argb8.val[2] = vadd_u8(d_argb8.val[2], vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vadd_u8(d_argb8.val[1], vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vadd_u8(d_argb8.val[0], vshrn_n_u16(d_b16, 8)); - - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - TVPLightenBlend_HDA_o_c(dest, src, 
pEndDst - dest, opa); - } -} - -static void TVPScreenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPScreenBlend_HDA_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - uint8x8x4_t d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - - uint16x8_t d_r16 = vmull_u8(vmvn_u8(s_argb8.val[2]), vmvn_u8(d_argb8.val[2])); - uint16x8_t d_g16 = vmull_u8(vmvn_u8(s_argb8.val[1]), vmvn_u8(d_argb8.val[1])); - uint16x8_t d_b16 = vmull_u8(vmvn_u8(s_argb8.val[0]), vmvn_u8(d_argb8.val[0])); - d_argb8.val[2] = vmvn_u8(vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vmvn_u8(vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vmvn_u8(vshrn_n_u16(d_b16, 8)); - - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - TVPScreenBlend_HDA_c(dest, src, pEndDst - dest); - } -} - -static void TVPScreenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPScreenBlend_HDA_o_c(dest, src, PreFragLen, opa); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - uint8x8_t opa8 = vdup_n_u8(opa); - while(dest < pVecEndDst) { - uint8x8x4_t s_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - - uint16x8_t s_r16 = vmull_u8(s_argb8.val[2], opa8); - uint16x8_t s_g16 = vmull_u8(s_argb8.val[1], opa8); - uint16x8_t s_b16 = vmull_u8(s_argb8.val[0], opa8); - uint8x8x4_t 
d_argb8 = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - s_argb8.val[2] = vshrn_n_u16(s_r16, 8); - s_argb8.val[1] = vshrn_n_u16(s_g16, 8); - s_argb8.val[0] = vshrn_n_u16(s_b16, 8); - - uint16x8_t d_r16 = vmull_u8(vmvn_u8(s_argb8.val[2]), vmvn_u8(d_argb8.val[2])); - uint16x8_t d_g16 = vmull_u8(vmvn_u8(s_argb8.val[1]), vmvn_u8(d_argb8.val[1])); - uint16x8_t d_b16 = vmull_u8(vmvn_u8(s_argb8.val[0]), vmvn_u8(d_argb8.val[0])); - d_argb8.val[2] = vmvn_u8(vshrn_n_u16(d_r16, 8)); - d_argb8.val[1] = vmvn_u8(vshrn_n_u16(d_g16, 8)); - d_argb8.val[0] = vmvn_u8(vshrn_n_u16(d_b16, 8)); - - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d_argb8); - dest += 8; - src += 8; - } - - if(dest < pEndDst) { - TVPScreenBlend_HDA_o_c(dest, src, pEndDst - dest, opa); - } -} - -#define STRECH_FUNC -#define COPY_FUNC -#define FUNC_NAME TVPInterpStretchCopy -#define C_FUNC_NAME TVPInterpStretchCopy_c -#include "InterpTransBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef COPY_FUNC -#undef STRECH_FUNC - -#define LINEAR_TRANS_FUNC -#define COPY_FUNC -#define FUNC_NAME TVPInterpLinTransCopy -#define C_FUNC_NAME TVPInterpLinTransCopy_c -#include "InterpTransBlend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef COPY_FUNC -#undef LINEAR_TRANS_FUNC - -static void TVPFastLinearInterpV2_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src0, const tjs_uint32 *src1) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPFastLinearInterpV2_c(dest, PreFragLen, src0, src1); - dest += PreFragLen; - src0 += PreFragLen; - src1 += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-3; - if ((((intptr_t)src0) & 7) && (((intptr_t)src1) & 7)) { - while (dest < pVecEndDst) { - uint8x16_t s0 = vld1q_u8(__builtin_assume_aligned((uint8_t *)src0, 8)); - uint8x16_t s1 = vld1q_u8(__builtin_assume_aligned((uint8_t *)src1, 8)); - - 
vst1q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), vhaddq_u8(s0, s1)); - dest += 4; - src0 += 4; - src1 += 4; - } - } else { - while (dest < pVecEndDst) { - uint8x16_t s0 = vld1q_u8(__builtin_assume_aligned((uint8_t *)src0, 4)); - uint8x16_t s1 = vld1q_u8(__builtin_assume_aligned((uint8_t *)src1, 4)); - - vst1q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), vhaddq_u8(s0, s1)); - dest += 4; - src0 += 4; - src1 += 4; - } - } - - if(dest < pEndDst) { - TVPFastLinearInterpV2_c(dest, pEndDst - dest, src0, src1); - } -} - -static void TVPCopyMask_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPCopyMask_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { -#if 1 - //__builtin_prefetch(src, 0, 0); - uint8x8x4_t s = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[3] = s.val[3]; - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - src += 8; - dest += 8; -#else - __asm__ __volatile__( - "vld4.u8 {d0,d1,d2,d3}, [%1:128] \n\t" // d - "vld4.u8 {d3,d4,d5,d6}, [%0]! \n\t" // s - "vmov.u8 d3, d6 \n\t" - "vst4.u8 {d0,d1,d2,d3}, [%1:128]! \n\t" // d - : - : "rw"(src), "rw"(dest) - ); -#endif - } - } else { - while (dest < pVecEndDst) { -#if 1 - //__builtin_prefetch(src, 0, 0); - uint8x8x4_t s = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 8)); - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - d.val[3] = s.val[3]; - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - src += 8; - dest += 8; -#else - __asm__ __volatile__( - "vld4.u8 {d0,d1,d2,d3}, [%1:128] \n\t" // d - "vld4.u8 {d3,d4,d5,d6}, [%0:128]! 
\n\t" // s - "vmov.u8 d3, d6 \n\t" - "vst4.u8 {d0,d1,d2,d3}, [%1:128]! \n\t" // d - : - : "rw"(src), "rw"(dest) - ); -#endif - } - } - - if(dest < pEndDst) { - TVPCopyMask_c(dest, src, pEndDst - dest); - } -} - -static void TVPCopyColor_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPCopyColor_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - //__builtin_prefetch(src, 0, 0); - uint8x8x4_t s = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 4)); - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - s.val[3] = d.val[3]; - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - src += 8; - dest += 8; - } - } else { - while (dest < pVecEndDst) { - //__builtin_prefetch(src, 0, 0); - uint8x8x4_t s = vld4_u8(__builtin_assume_aligned((uint8_t *)src, 8)); - uint8x8x4_t d = vld4_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - s.val[3] = d.val[3]; - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - src += 8; - dest += 8; - } - } - - if(dest < pEndDst) { - TVPCopyColor_c(dest, src, pEndDst - dest); - } -} - -static void TVPBindMaskToMain_NEON(tjs_uint32 *main, const tjs_uint8 *mask, tjs_int len) -{ - tjs_uint32* pEndDst = main + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)main) + 7)&~7) - main; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPBindMaskToMain_c(main, mask, PreFragLen); - main += PreFragLen; - mask += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - if (((intptr_t)mask) & 7) { - while (main < pVecEndDst) { - __builtin_prefetch(mask, 0, 0); - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned((uint8_t *)main, 
8)); - s.val[3] = vld1q_u8(mask); - vst4q_u8(__builtin_assume_aligned((uint8_t *)main, 8), s); - main += 16; - mask += 16; - } - } else { - while (main < pVecEndDst) { - __builtin_prefetch(mask, 0, 0); - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned((uint8_t *)main, 8)); - s.val[3] = vld1q_u8(__builtin_assume_aligned((uint8_t *)mask, 8)); - vst4q_u8(__builtin_assume_aligned((uint8_t *)main, 8), s); - main += 16; - mask += 16; - } - } - - if(main < pEndDst) { - TVPBindMaskToMain_c(main, mask, pEndDst - main); - } -} - -static void TVPFillARGB_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 value) -{ - tjs_uint32* pEndDst = dest + len; - while((((intptr_t)dest)&~7) && dest < pEndDst) { - *dest++ = value; - } - - uint32x4_t v = vdupq_n_u32(value); - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-3; - while(dest < pVecEndDst) { - vst1q_u32(__builtin_assume_aligned(dest, 8), v); - dest += 4; - } - while(dest < pEndDst) { - *dest++ = value; - } -} - -static void TVPFillColor_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 color) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPFillColor_c(dest, PreFragLen, color); - dest += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - uint8x16x4_t s; - s.val[0] = vdupq_n_u8(color & 0xff); - s.val[1] = vdupq_n_u8((color >> 8) & 0xff); - s.val[2] = vdupq_n_u8((color >> 16) & 0xff); - while(dest < pVecEndDst) { - uint8x16x4_t d = vld4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - s.val[3] = d.val[3]; - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - dest += 16; - } - - if(dest < pEndDst) { - TVPFillColor_c(dest, pEndDst - dest, color); - } -} - -static void TVPFillMask_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 mask) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) 
- dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPFillMask_c(dest, PreFragLen, mask); - dest += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - while(dest < pVecEndDst) { - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8)); - s.val[3] = vdupq_n_u8(mask); - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - dest += 16; - } - - if(dest < pEndDst) { - TVPFillMask_c(dest, pEndDst - dest, mask); - } -} - -static void TVPAddSubVertSum16_NEON(tjs_uint16 *dest, const tjs_uint32 *addline, const tjs_uint32 *subline, tjs_int len) -{ - tjs_uint16* pEndDst = dest + len * 4; - { - tjs_int PreFragLen = ((tjs_uint16*)((((intptr_t)dest) + 7)&~7) - dest) / 4; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPAddSubVertSum16_c(dest, addline, subline, PreFragLen); - dest += PreFragLen * 4; - addline += PreFragLen; - subline += PreFragLen; - } - } - - tjs_uint16* pVecEndDst = (tjs_uint16*)(((intptr_t)pEndDst)&~7)-7; - if ((((intptr_t)addline) & 7) && (((intptr_t)subline) & 7)) { - while (dest < pVecEndDst) { - uint8x8x4_t add = vld4_u8((unsigned char *)addline); - uint8x8x4_t sub = vld4_u8((unsigned char *)subline); - uint16x8x4_t d = vld4q_u16(__builtin_assume_aligned(dest, 8)); - d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); - d.val[2] = vaddq_u16(d.val[2], vsubl_u8(add.val[2], sub.val[2])); - d.val[1] = vaddq_u16(d.val[1], vsubl_u8(add.val[1], sub.val[1])); - d.val[0] = vaddq_u16(d.val[0], vsubl_u8(add.val[0], sub.val[0])); - vst4q_u16(__builtin_assume_aligned(dest, 8), d); - dest += 8 * 4; - addline += 8; - subline += 8; - } - } else { - while (dest < pVecEndDst) { - uint8x8x4_t add = vld4_u8(__builtin_assume_aligned((uint8_t *)addline, 8)); - uint8x8x4_t sub = vld4_u8(__builtin_assume_aligned((uint8_t *)subline, 8)); - uint16x8x4_t d = vld4q_u16(__builtin_assume_aligned(dest, 8)); - d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); 
- d.val[2] = vaddq_u16(d.val[2], vsubl_u8(add.val[2], sub.val[2])); - d.val[1] = vaddq_u16(d.val[1], vsubl_u8(add.val[1], sub.val[1])); - d.val[0] = vaddq_u16(d.val[0], vsubl_u8(add.val[0], sub.val[0])); - vst4q_u16(__builtin_assume_aligned(dest, 8), d); - dest += 8 * 4; - addline += 8; - subline += 8; - } - } - - if(dest < pEndDst) { - TVPAddSubVertSum16_c(dest, addline, subline, (pEndDst - dest) / 4); - } -} - -static void TVPAddSubVertSum16_d_NEON(tjs_uint16 *dest, const tjs_uint32 *addline, const tjs_uint32 *subline, tjs_int len) -{ - tjs_uint16* pEndDst = dest + len * 4; - { - tjs_int PreFragLen = ((tjs_uint16*)((((intptr_t)dest) + 7)&~7) - dest) / 4; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPAddSubVertSum16_d_c(dest, addline, subline, PreFragLen); - dest += PreFragLen * 4; - addline += PreFragLen; - subline += PreFragLen; - } - } - - tjs_uint16* pVecEndDst = (tjs_uint16*)(((intptr_t)pEndDst)&~7)-7; - while(dest < pVecEndDst) { - uint8x8x4_t add = vld4_u8((unsigned char *)addline); - uint8x8x4_t sub = vld4_u8((unsigned char *)subline); - uint16x8x4_t d = vld4q_u16(__builtin_assume_aligned(dest, 8)); - - uint16x8_t add_a = vaddl_u8(add.val[3], vshr_n_u8(add.val[3], 7)); - uint16x8_t sub_a = vaddl_u8(sub.val[3], vshr_n_u8(sub.val[3], 7)); - d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); - - uint16x8_t add_16 = vmulq_u16(vmovl_u8(add.val[2]), add_a); - uint16x8_t sub_16 = vmulq_u16(vmovl_u8(sub.val[2]), sub_a); - add_16 = vshrq_n_u16(add_16, 8); - sub_16 = vshrq_n_u16(sub_16, 8); - d.val[2] = vaddq_u16(d.val[2], vsubq_u16(add_16, sub_16)); - - add_16 = vmulq_u16(vmovl_u8(add.val[1]), add_a); - sub_16 = vmulq_u16(vmovl_u8(sub.val[1]), sub_a); - add_16 = vshrq_n_u16(add_16, 8); - sub_16 = vshrq_n_u16(sub_16, 8); - d.val[1] = vaddq_u16(d.val[1], vsubq_u16(add_16, sub_16)); - - add_16 = vmulq_u16(vmovl_u8(add.val[0]), add_a); - sub_16 = vmulq_u16(vmovl_u8(sub.val[0]), sub_a); - add_16 = vshrq_n_u16(add_16, 8); - sub_16 = 
vshrq_n_u16(sub_16, 8); - d.val[0] = vaddq_u16(d.val[0], vsubq_u16(add_16, sub_16)); - - vst4q_u16(__builtin_assume_aligned(dest, 8), d); - dest += 8 * 4; - addline += 8; - subline += 8; - } - - if(dest < pEndDst) { - TVPAddSubVertSum16_d_c(dest, addline, subline, (pEndDst - dest) / 4); - } -} - -static void TVPDoBoxBlurAvg16_NEON(tjs_uint32 *dest, tjs_uint16 *_sum, const tjs_uint16 * add, const tjs_uint16 * sub, tjs_int n, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPDoBoxBlurAvg16_c(dest, _sum, add, sub, n, PreFragLen); - dest += PreFragLen; - add += PreFragLen; - sub += PreFragLen; - } - } - - static const int32_t c_shl_n[4] = { 0, 8, 16, 24 }; - - uint32x4_t rcp = vdupq_n_u32((1<<16) / n); - int32x4_t shl_n = vld1q_s32(c_shl_n); - uint16x4_t half_n = vdup_n_u16(n >> 1); - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-7; - uint16x4_t sum = vld1_u16(_sum); - while (dest < pVecEndDst) { - uint32x4_t t = vmulq_u32(vaddl_u16(sum, half_n), rcp); - uint32x4_t d = vshlq_u32(vshrq_n_u32(t, 16), shl_n); - -// t0 = vmul_u32(vaddl_u16(src_sum.val[2], half_n)); -// vorr_u32(d, vshl_n_u32(vshr_n_u32(t1, 16), 8)); -// t1 = vmul_u32(vaddl_u16(src_sum.val[3], half_n)); -// vorr_u32(d, vshl_n_u32(vshr_n_u32(t0, 16), 8)); -// vorr_u32(d, vshl_n_u32(vshr_n_u32(t1, 16), 8)); -// -// uint16x4x4_t src_add = vld4_u16 - -// uint16x8_t add = vld1q_u16(add); -// uint16x8_t sub = vld1q_u16(sub); -// uint16x8_t d = vld4q_u16(dest); -// d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); -// d.val[2] = vaddq_u16(d.val[2], vsubl_u8(add.val[2], sub.val[2])); -// d.val[1] = vaddq_u16(d.val[1], vsubl_u8(add.val[1], sub.val[1])); -// d.val[0] = vaddq_u16(d.val[0], vsubl_u8(add.val[0], sub.val[0])); - vst1q_u32(dest, d); - dest += 8; - add += 8; - sub += 8; - } - - if(dest < pEndDst) { - TVPDoBoxBlurAvg16_c(dest, _sum, 
add, sub, n, pEndDst - dest); - } -} - -static void TVPExpand8BitTo32BitGray_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPExpand8BitTo32BitGray_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - uint8x16x4_t d; - d.val[3] = vdupq_n_u8(0xFF); - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - d.val[2] = vld1q_u8(src); - d.val[1] = d.val[2]; - d.val[0] = d.val[2]; - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 16; - src += 16; - } - } else { - while (dest < pVecEndDst) { - d.val[2] = vld1q_u8(__builtin_assume_aligned((uint8_t *)src, 8)); - d.val[1] = d.val[2]; - d.val[0] = d.val[2]; - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 16; - src += 16; - } - } - - if(dest < pEndDst) { - TVPExpand8BitTo32BitGray_c(dest, src, pEndDst - dest); - } -} - -static void TVPReverseRGB_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPReverseRGB_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - uint8x16x4_t d = vld4q_u8((uint8_t*)src); - uint8x16_t t = d.val[0]; - d.val[0] = d.val[2]; - d.val[2] = t; - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 16; - src += 16; - } - } else { - while (dest < pVecEndDst) { - uint8x16x4_t d = vld4q_u8(__builtin_assume_aligned((uint8_t *)src, 8)); - uint8x16_t t = d.val[0]; - d.val[0] = d.val[2]; - d.val[2] = t; - 
vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 16; - src += 16; - } - } - - if(dest < pEndDst) { - TVPReverseRGB_c(dest, src, pEndDst - dest); - } -} - -static void TVPUpscale65_255_NEON(tjs_uint8 *dest, tjs_int len) { - // dest is already aligned by 16 bytes - tjs_uint8* pEndDst = dest + len; - tjs_uint8* pVecEndDst = (tjs_uint8*)(((intptr_t)pEndDst)&~7) - 15; - - while (dest < pVecEndDst) { - uint8x16_t d = vld1q_u8(__builtin_assume_aligned((uint8_t *)dest, 16)); - d = vqshlq_n_u8(d, 2); - vst1q_u8(__builtin_assume_aligned((uint8_t *)dest, 16), d); - dest += 16; - } - while (dest < pEndDst) { - tjs_uint c = *dest << 2; - *dest = c > 255 ? 255 : c; - ++dest; - } -} - -static const unsigned char rgb555_lut[4][8] = { - { 0, 0x8, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39 }, - { 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B }, - { 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD }, - { 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF } }; - -static void TVPBLConvert15BitTo32Bit_NEON(tjs_uint32 *dest, const tjs_uint16 *src, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if (PreFragLen > len) PreFragLen = len; - if (PreFragLen) { - TVPBLConvert15BitTo32Bit_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7) - 7; - - if (dest < pVecEndDst) - { - uint8x8x4_t d; - d.val[3] = vdup_n_u8(0xFF); -#if 0 //def __LP64__ - uint8x16x2_t lut; - lut.val[0] = vld1q_u8(rgb555_lut[0]); - lut.val[1] = vld1q_u8(rgb555_lut[2]); - - while (dest < pVecEndDst) { - uint16x8_t s = vshlq_n_u16(vld1q_u16(src), 1); - d.val[0] = vtbl2q_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - s = vshlq_n_u16(s, 5); - d.val[1] = vtbl2q_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - s = vshlq_n_u16(s, 5); - d.val[2] = vtbl2q_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - vst4_u8((uint8_t*)dest, d); - dest += 8; - src += 8; - } -#else - 
uint8x8x4_t lut; - lut.val[0] = vld1_u8(rgb555_lut[0]); - lut.val[1] = vld1_u8(rgb555_lut[1]); - lut.val[2] = vld1_u8(rgb555_lut[2]); - lut.val[3] = vld1_u8(rgb555_lut[3]); - - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - uint16x8_t s = vshlq_n_u16(vld1q_u16(src), 1); - d.val[0] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - s = vshlq_n_u16(s, 5); - d.val[1] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - s = vshlq_n_u16(s, 5); - d.val[2] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 8; - src += 8; - } - } else { - while (dest < pVecEndDst) { - uint16x8_t s = vshlq_n_u16(vld1q_u16(__builtin_assume_aligned(src, 8)), 1); - d.val[0] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - s = vshlq_n_u16(s, 5); - d.val[1] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - s = vshlq_n_u16(s, 5); - d.val[2] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 8; - src += 8; - } - } -#endif - } - - if (dest < pEndDst) { - TVPBLConvert15BitTo32Bit_c(dest, src, pEndDst - dest); - } -} - -static void TVPConvert24BitTo32Bit_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len) -{ - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPConvert24BitTo32Bit_c(dest, src, PreFragLen); - dest += PreFragLen; - src += PreFragLen * 3; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - uint8x16x4_t d; - d.val[3] = vdupq_n_u8(0xFF); - if (((intptr_t)src) & 7) { - while (dest < pVecEndDst) { - uint8x16x3_t s = vld3q_u8(src); - d.val[2] = s.val[0]; - d.val[1] = s.val[1]; - d.val[0] = s.val[2]; - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 16; - src += 16 * 3; - } - } else { - while (dest < pVecEndDst) { - uint8x16x3_t s = vld3q_u8(__builtin_assume_aligned(src, 8)); 
- d.val[2] = s.val[0]; - d.val[1] = s.val[1]; - d.val[0] = s.val[2]; - vst4q_u8(__builtin_assume_aligned((uint8_t *)dest, 8), d); - dest += 16; - src += 16 * 3; - } - } - - if(dest < pEndDst) { - TVPConvert24BitTo32Bit_c(dest, src, pEndDst - dest); - } -} - -static void TVPConvert32BitTo24Bit_NEON(tjs_uint8 *dest, const tjs_uint8 *src, tjs_int len) { - const tjs_uint8* pEndSrc = src + len; - { - tjs_int PreFragLen = (const tjs_uint8*)((((intptr_t)src) + 7)&~7) - src; - if (PreFragLen > len) PreFragLen = len; - const tjs_uint8 *pend = src + PreFragLen; // in bytes - while (src < pend) - { - dest[0] = src[0]; - dest[1] = src[1]; - dest[2] = src[2]; - dest += 3; - src += 4; - } - } - - const tjs_uint8* pVecEndSrc = (const tjs_uint8*)(((intptr_t)pEndSrc)&~7) - 15; - uint8x16x3_t d; - if (((intptr_t)dest) & 7) { - while (src < pVecEndSrc) { - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned(src, 8)); - d.val[0] = s.val[0]; - d.val[1] = s.val[1]; - d.val[2] = s.val[2]; - vst3q_u8(dest, d); - dest += 16 * 3; - src += 16 * 4; - } - } else { - while (src < pVecEndSrc) { - uint8x16x4_t s = vld4q_u8(__builtin_assume_aligned(src, 8)); - d.val[0] = s.val[0]; - d.val[1] = s.val[1]; - d.val[2] = s.val[2]; - vst3q_u8(__builtin_assume_aligned(dest, 8), d); - dest += 16 * 3; - src += 16 * 4; - } - } - - while (src < pEndSrc) { - dest[0] = src[0]; - dest[1] = src[1]; - dest[2] = src[2]; - dest += 3; - src += 4; - } -} - -static void TVPDoGrayScale_NEON(tjs_uint32 *dest, tjs_int len) { - tjs_uint32* pEndDst = dest + len; - { - tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; - if(PreFragLen > len) PreFragLen = len; - if(PreFragLen) { - TVPDoGrayScale_c(dest, PreFragLen); - dest += PreFragLen; - } - } - - tjs_uint32* pVecEndDst = (tjs_uint32*)(((intptr_t)pEndDst)&~7)-15; - uint8x8_t const_19 = vdup_n_u8(19), const_183 = vdup_n_u8(183), const_54 = vdup_n_u8(54); - while(dest < pVecEndDst) { - uint8x8x4_t s = vld4_u8((uint8_t*)dest); - uint16x8_t r = 
vmull_u8(s.val[0], const_19); - uint16x8_t g = vmull_u8(s.val[1], const_183); - uint16x8_t b = vmull_u8(s.val[2], const_54); - r = vaddq_u16(r, g); - r = vaddq_u16(r, b); - s.val[2] = s.val[1] = s.val[0] = vshrn_n_u16(r, 8); - vst4_u8(__builtin_assume_aligned((uint8_t *)dest, 8), s); - dest += 8; - } - - if(dest < pEndDst) { - TVPDoGrayScale_c(dest, pEndDst - dest); - } -} - -#ifndef Region_PSBlend -#define TVPPS_OPERATION "ps_alphablend.h" -#define FUNC_NAME TVPPsAlphaBlend -#define C_FUNC_NAME TVPPsAlphaBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsAlphaBlend_o -#define C_FUNC_NAME TVPPsAlphaBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_addblend.h" -#define FUNC_NAME TVPPsAddBlend -#define C_FUNC_NAME TVPPsAddBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsAddBlend_o -#define C_FUNC_NAME TVPPsAddBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_subblend.h" -#define FUNC_NAME TVPPsSubBlend -#define C_FUNC_NAME TVPPsSubBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsSubBlend_o -#define C_FUNC_NAME TVPPsSubBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_mulblend.h" -#define FUNC_NAME TVPPsMulBlend -#define C_FUNC_NAME TVPPsMulBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsMulBlend_o -#define C_FUNC_NAME TVPPsMulBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - 
-#define TVPPS_OPERATION "ps_screenblend.h" -#define FUNC_NAME TVPPsScreenBlend -#define C_FUNC_NAME TVPPsScreenBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsScreenBlend_o -#define C_FUNC_NAME TVPPsScreenBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_overlayblend.h" -#define TVPPS_PREPROC uint8x8_t mask80 = vdup_n_u8(0x80), mask1 = vdup_n_u8(1), maskFE = vdup_n_u8(0xFE); -#define FUNC_NAME TVPPsOverlayBlend -#define C_FUNC_NAME TVPPsOverlayBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsOverlayBlend_o -#define C_FUNC_NAME TVPPsOverlayBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_hardlightblend.h" -#define FUNC_NAME TVPPsHardLightBlend -#define C_FUNC_NAME TVPPsHardLightBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsHardLightBlend_o -#define C_FUNC_NAME TVPPsHardLightBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_PREPROC -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_lightenblend.h" -#define FUNC_NAME TVPPsLightenBlend -#define C_FUNC_NAME TVPPsLightenBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsLightenBlend_o -#define C_FUNC_NAME TVPPsLightenBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_darkenblend.h" -#define FUNC_NAME TVPPsDarkenBlend -#define C_FUNC_NAME TVPPsDarkenBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define 
FUNC_NAME TVPPsDarkenBlend_o -#define C_FUNC_NAME TVPPsDarkenBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_diffblend.h" -#define FUNC_NAME TVPPsDiffBlend -#define C_FUNC_NAME TVPPsDiffBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsDiffBlend_o -#define C_FUNC_NAME TVPPsDiffBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_diff5blend.h" -#define FUNC_NAME TVPPsDiff5Blend -#define C_FUNC_NAME TVPPsDiff5Blend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsDiff5Blend_o -#define C_FUNC_NAME TVPPsDiff5Blend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#define TVPPS_OPERATION "ps_exclusionblend.h" -#define FUNC_NAME TVPPsExclusionBlend -#define C_FUNC_NAME TVPPsExclusionBlend_HDA_c -#include "psblend.h" -#undef C_FUNC_NAME -#undef FUNC_NAME - -#define FUNC_NAME TVPPsExclusionBlend_o -#define C_FUNC_NAME TVPPsExclusionBlend_HDA_o_c -#define BLEND_WITH_OPACITY -#include "psblend.h" -#undef BLEND_WITH_OPACITY -#undef C_FUNC_NAME -#undef FUNC_NAME -#undef TVPPS_OPERATION - -#endif - -#if TVP_TLG6_W_BLOCK_SIZE != 8 -#error TVP_TLG6_W_BLOCK_SIZE must be 8 ! 
-#endif - -/* - +---+---+ - |lt | t | / min(l, t), if lt >= max(l, t); - +---+---+ ret = | max(l, t), if lt >= min(l, t); - | l |ret| \ l + t - lt, otherwise; - +---+---+ -*/ -#ifdef DEBUG_ARM_NEON -static inline uint8x8_t med_NEON(uint32x2_t l, uint32x2_t t, uint32x2_t lt) -{ - uint8x8_t max_l_t = vmax_u8(vreinterpret_u8_u32(l), vreinterpret_u8_u32(t)); - uint8x8_t min_l_t = vmin_u8(vreinterpret_u8_u32(l), vreinterpret_u8_u32(t)); - return vsub_u8(vadd_u8(max_l_t, min_l_t), vmax_u8(vmin_u8(max_l_t, vreinterpret_u8_u32(lt)), min_l_t)); -} -#else -#define med_NEON(l, t, lt) \ - uint8x8_t max_l_t = vmax_u8(vreinterpret_u8_u32(l), vreinterpret_u8_u32(t));\ - uint8x8_t min_l_t = vmin_u8(vreinterpret_u8_u32(l), vreinterpret_u8_u32(t));\ - uint8x8_t m = vsub_u8(vadd_u8(max_l_t, min_l_t), vmax_u8(vmin_u8(max_l_t, vreinterpret_u8_u32(lt)), min_l_t)); -#endif - -void TVPTLG6DecodeLineGeneric_NEON(tjs_uint32 *prevline, tjs_uint32 *curline, tjs_int width, tjs_int start_block, tjs_int block_limit, tjs_uint8 *filtertypes, tjs_int skipblockbytes, tjs_uint32 *in, tjs_uint32 initialp, tjs_int oddskip, tjs_int dir) -{ - /* - chroma/luminosity decoding - (this does reordering, color correlation filter, MED/AVG at a time) - */ - uint32x2_t p, up; - int step, i; - - if(start_block) - { - prevline += start_block * TVP_TLG6_W_BLOCK_SIZE; - curline += start_block * TVP_TLG6_W_BLOCK_SIZE; - p = vdup_n_u32(curline[-1]); - up = vdup_n_u32(prevline[-1]); - } - else - { - p = vdup_n_u32(initialp); - up = vdup_n_u32(initialp); - } - - in += skipblockbytes * start_block; - step = (dir&1)?1:-1; - - for(i = start_block; i < block_limit; i ++) - { - int w = width - i*TVP_TLG6_W_BLOCK_SIZE, ww; - if(w > TVP_TLG6_W_BLOCK_SIZE) w = TVP_TLG6_W_BLOCK_SIZE; - ww = w; - if(step==-1) in += ww-1; - if(i&1) in += oddskip * ww; - switch(filtertypes[i]) - { -#define IA (char)(clr>>24) -#define IR (char)(clr>>16) -#define IG (char)(clr>>8 ) -#define IB (char)(clr ) -#define TLG6_SET_CLR(R, G, B) (0xff0000 & 
((R)<<16)) + (0xff00 & ((G)<<8)) + (0xff & (B)) + ((IA) << 24) - - // TVP_TLG6_DO_CHROMA_DECODE( 0, IB, IG, IR); -#define N 0 -#define FILTER TLG6_SET_CLR(IB, IG, IR) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - - // TVP_TLG6_DO_CHROMA_DECODE( 1, IB+IG, IG, IR+IG); -#define N 1 -#define FILTER TLG6_SET_CLR(IB+IG, IG, IR+IG) - -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 2, IB, IG+IB, IR+IB+IG); -#define N 2 -#define FILTER TLG6_SET_CLR(IB, IG+IB, IR+IB+IG) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 3, IB+IR+IG, IG+IR, IR); -#define N 3 -#define FILTER TLG6_SET_CLR(IB+IR+IG, IG+IR, IR) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 4, IB+IR, IG+IB+IR, IR+IB+IR+IG); -#define N 4 -#define FILTER TLG6_SET_CLR(IB+IR, IG+IB+IR, IR+IB+IR+IG) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 5, IB+IR, IG+IB+IR, IR); -#define N 5 -#define FILTER TLG6_SET_CLR(IB+IR, IG+IB+IR, IR) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 6, IB+IG, IG, IR); -#define N 6 -#define FILTER TLG6_SET_CLR(IB+IG, IG, IR) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 7, IB, IG+IB, IR); -#define N 7 -#define FILTER TLG6_SET_CLR(IB, IG+IB, IR) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 8, IB, IG, IR+IG); -#define N 8 -#define FILTER TLG6_SET_CLR(IB, IG, IR+IG) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE( 9, IB+IG+IR+IB, IG+IR+IB, IR+IB); -#define N 9 -#define FILTER TLG6_SET_CLR(IB+IG+IR+IB, IG+IR+IB, IR+IB) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE(10, IB+IR, IG+IR, IR); -#define N 10 -#define FILTER TLG6_SET_CLR(IB+IR, IG+IR, IR) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE(11, IB, IG+IB, 
IR+IB); -#define N 11 -#define FILTER TLG6_SET_CLR(IB, IG+IB, IR+IB) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE(12, IB, IG+IR+IB, IR+IB); -#define N 12 -#define FILTER TLG6_SET_CLR(IB, IG+IR+IB, IR+IB) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE(13, IB+IG, IG+IR+IB+IG, IR+IB+IG); -#define N 13 -#define FILTER TLG6_SET_CLR(IB+IG, IG+IR+IB+IG, IR+IB+IG) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE(14, IB+IG+IR, IG+IR, IR+IB+IG+IR); -#define N 14 -#define FILTER TLG6_SET_CLR(IR+IB+IG, IG+IR, IR+IB+IG+IR) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - // TVP_TLG6_DO_CHROMA_DECODE(15, IB, IG+(IB<<1), IR+(IB<<1)); -#define N 15 -#define FILTER TLG6_SET_CLR(IB, IG+(IB<<1), IR+(IB<<1)) -#include "TLG6_do_chroma.h" -#undef FILTER -#undef N - - default: return; - } - if(step == 1) - in += skipblockbytes - ww; - else - in += skipblockbytes + 1; - if(i&1) in -= oddskip * ww; -#undef IR -#undef IG -#undef IB - } -} - -static void TVPTLG5ComposeColors3To4_NEON(tjs_uint8 *outp, const tjs_uint8 *upper, tjs_uint8 * const * buf, tjs_int width) -{ - const tjs_uint8 * p2 = buf[0]; - const tjs_uint8 * p1 = buf[1]; - const tjs_uint8 * p0 = buf[2]; - int x = 0; - uint8x8x3_t pc; - pc.val[0] = vdup_n_u8(0); - pc.val[1] = vdup_n_u8(0); - pc.val[2] = vdup_n_u8(0); - uint8x8x4_t rgba; - rgba.val[3] = vdup_n_u8(0xFF); - for(x = 0; x < width - 7; x += 8) { - uint8x8x3_t c; - c.val[1] = vld1_u8(p1 + x); - c.val[0] = vadd_u8(vld1_u8(p0 + x), c.val[1]); - c.val[2] = vadd_u8(vld1_u8(p2 + x), c.val[1]); - pc.val[0] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[0], 7)), c.val[0]); - pc.val[1] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[1], 7)), c.val[1]); - pc.val[2] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[2], 7)), c.val[2]); - for(int i = 0; i < 7; ++i) { - c.val[0] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[0]), 8)); - c.val[1] = 
vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[1]), 8)); - c.val[2] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[2]), 8)); - pc.val[0] = vadd_u8(pc.val[0], c.val[0]); - pc.val[1] = vadd_u8(pc.val[1], c.val[1]); - pc.val[2] = vadd_u8(pc.val[2], c.val[2]); - } - uint8x8x4_t up = vld4_u8(upper); - rgba.val[0] = vadd_u8(pc.val[0], up.val[0]); - rgba.val[1] = vadd_u8(pc.val[1], up.val[1]); - rgba.val[2] = vadd_u8(pc.val[2], up.val[2]); - vst4_u8(outp, rgba); - outp += 4 * 8; - upper += 4 * 8; - } - - if(x < width) { - tjs_uint8 _pc[3]; - tjs_uint8 _c[3]; - _pc[0] = vget_lane_u8(pc.val[0], 7); - _pc[1] = vget_lane_u8(pc.val[1], 7); - _pc[2] = vget_lane_u8(pc.val[2], 7); - for(; x < width; x++) - { - _c[0] = p0[x]; - _c[1] = p1[x]; - _c[2] = p2[x]; - _c[0] += _c[1]; _c[2] += _c[1]; - *(tjs_uint32 *)outp = - ((((_pc[0] += _c[0]) + upper[0]) & 0xff) ) + - ((((_pc[1] += _c[1]) + upper[1]) & 0xff) << 8) + - ((((_pc[2] += _c[2]) + upper[2]) & 0xff) << 16) + - 0xff000000; - outp += 4; - upper += 4; - } - } -} - -static void TVPTLG5ComposeColors4To4_NEON(tjs_uint8 *outp, const tjs_uint8 *upper, tjs_uint8 * const * buf, tjs_int width) -{ -#ifdef TEST_ARM_NEON_CODE - TVPTLG5ComposeColors4To4_c(outp, upper, buf, width); - tjs_uint8 *orig_outp = outp; - tjs_uint8 *test_outp = outp = new tjs_uint8[width * 4]; -#endif - const tjs_uint8 * p2 = buf[0]; - const tjs_uint8 * p1 = buf[1]; - const tjs_uint8 * p0 = buf[2]; - const tjs_uint8 * p3 = buf[3]; - int x = 0; - uint8x8x4_t pc; - pc.val[0] = vdup_n_u8(0); - pc.val[1] = vdup_n_u8(0); - pc.val[2] = vdup_n_u8(0); - pc.val[3] = vdup_n_u8(0); - for(x = 0; x < width - 7; x += 8) { - uint8x8x4_t c; - c.val[1] = vld1_u8(p1 + x); - c.val[0] = vadd_u8(vld1_u8(p0 + x), c.val[1]); - c.val[2] = vadd_u8(vld1_u8(p2 + x), c.val[1]); - c.val[3] = vld1_u8(p3 + x); - pc.val[0] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[0], 7)), c.val[0]); - pc.val[1] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[1], 7)), c.val[1]); - pc.val[2] = 
vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[2], 7)), c.val[2]); - pc.val[3] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[3], 7)), c.val[3]); - for(int i = 0; i < 7; ++i) { - c.val[0] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[0]), 8)); - c.val[1] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[1]), 8)); - c.val[2] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[2]), 8)); - c.val[3] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[3]), 8)); - pc.val[0] = vadd_u8(pc.val[0], c.val[0]); - pc.val[1] = vadd_u8(pc.val[1], c.val[1]); - pc.val[2] = vadd_u8(pc.val[2], c.val[2]); - pc.val[3] = vadd_u8(pc.val[3], c.val[3]); - } - uint8x8x4_t up = vld4_u8(upper); - uint8x8x4_t rgba; - rgba.val[0] = vadd_u8(pc.val[0], up.val[0]); - rgba.val[1] = vadd_u8(pc.val[1], up.val[1]); - rgba.val[2] = vadd_u8(pc.val[2], up.val[2]); - rgba.val[3] = vadd_u8(pc.val[3], up.val[3]); - vst4_u8(outp, rgba); - outp += 4 * 8; - upper += 4 * 8; - } - - if(x < width) { - tjs_uint8 _pc[4]; - tjs_uint8 _c[4]; - _pc[0] = vget_lane_u8(pc.val[0], 7); - _pc[1] = vget_lane_u8(pc.val[1], 7); - _pc[2] = vget_lane_u8(pc.val[2], 7); - _pc[3] = vget_lane_u8(pc.val[3], 7); - for(; x < width; x++) - { - _c[0] = p0[x]; - _c[1] = p1[x]; - _c[2] = p2[x]; - _c[3] = p3[x]; - _c[0] += _c[1]; _c[2] += _c[1]; - *(tjs_uint32 *)outp = - ((((_pc[0] += _c[0]) + upper[0]) & 0xff) ) + - ((((_pc[1] += _c[1]) + upper[1]) & 0xff) << 8) + - ((((_pc[2] += _c[2]) + upper[2]) & 0xff) << 16) + - ((((_pc[3] += _c[3]) + upper[3]) & 0xff) << 24); - outp += 4; - upper += 4; - } - } -#ifdef TEST_ARM_NEON_CODE - for (int i = 0; i < width * 4; ++i) { - assert(test_outp[i] == orig_outp[i]); - } - delete[]test_outp; -#endif -} - -static tjs_int TVPTLG5DecompressSlide_NEON(tjs_uint8 *out, const tjs_uint8 *in, tjs_int insize, tjs_uint8 *text, tjs_int initialr) { - // test -// std::vector tmp; tmp.resize(1024 * 768 * 4); -// std::vector ttext; ttext.insert(ttext.begin(), text, text + 4096 + 16); -// tjs_uint8 
*pout = out; -// tjs_int rr = TVPTLG5DecompressSlide_c(&tmp[0], in, insize, &ttext[0], initialr); - - tjs_int r = initialr; - tjs_uint flags = 0; - const tjs_uint8 *inlim = in + insize; - while (in < inlim) { - if (((flags >>= 1) & 256) == 0) { - flags = in[0] | 0xff00; - in++; - if (flags == 0xff00 && r < (4096 - 8) && in < (inlim - 8)) { // copy 8byte - uint8x8_t c = vld1_u8(in); - vst1_u8(out, c);; - vst1_u8(&text[r], c);; - r += 8; - in += 8; - out += 8; - flags = 0; - continue; - } - } - if (flags & 1) { - tjs_uint16 in16 = *(tjs_uint16*)in; - tjs_uint mpos = in16 & 0xFFF; - tjs_uint mlen = (in16 >> 12) + 3; - in += 2; - if (mlen == 18) - mlen += *in++; - if (mlen > 15 && (mpos - r > 15 || r - mpos > 15)) { - if ((mpos + mlen) < 4096 && (r + mlen) < 4096) { - tjs_int count = mlen >> 4; - while (count--) { - uint8x16_t c = vld1q_u8(&text[mpos]); - vst1q_u8(out, c); - vst1q_u8(&text[r], c); - mpos += 16; r += 16; out += 16; - } - mlen &= 0x0f; // モ爨 - while (mlen--) { - out[0] = text[r++] = text[mpos++]; out++; - } - continue; - } -#if 0 - while (mlen) { - uint8x16_t c = vld1q_u8(&text[mpos]); - vst1q_u8(out, c); - vst1q_u8(&text[r], c); // direct write to text is OK due to the extra 16 bytes - tjs_int next = mlen < 16 ? 
mlen : 16; - if (mpos + next > 4095) { - next = 4096 - mpos; - mpos = 0; - } else { - mpos += next; - } - out += next; - r += next; - mlen -= next; - if (r > 4095) { - r &= 0x0fff; - vst1q_u8(&text[r - 16], c); - } - } - continue; -#endif - } - while (mlen--) { - out[0] = text[r++] = text[mpos++]; out++; - mpos &= 0x0fff; - r &= 0x0fff; - } - } else { - unsigned char c = in[0]; in++; - out[0] = c; out++; - text[r++] = c; - r &= 0x0fff; - } - } - - // test -// assert(rr == r); -// for (int i = 0; i < out - pout; ++i) { -// assert(tmp[i] == pout[i]); -// } - - return r; -} - -//#include - -static tjs_uint32 *testbuff = NULL; -static tjs_uint32 *testdata1 = NULL; -static tjs_uint32 *testdata2 = NULL; -static tjs_uint32 *testdest1 = NULL; -static tjs_uint32 *testdest2 = NULL; -static tjs_uint32 *testtable = NULL; -static tjs_uint8 *testrule = NULL; -#include -#include - -#ifdef __cplusplus -#define FUNC_API extern "C" -#else -#define FUNC_API -#endif -FUNC_API int TVPShowSimpleMessageBox(const char * text, const char * caption, unsigned int nButton, const char **btnText); -FUNC_API tjs_uint32 TVPGetTickTime(); - -static void ShowInMessageBox(const char *text) { - if (!text || !*text) return; - const char *btnText = "OK"; - TVPShowSimpleMessageBox(text, "Log", 1, &btnText); -} - -static void InitTestData() { - if(!testtable) { - testtable = (tjs_uint32*)malloc(256 * sizeof(tjs_uint32)); - for(int x = 0; x < 256; ++x) { - testtable[x] = rand() & 0xFF; - } - testrule = (tjs_uint8*)malloc(256 * 256); - for(int x = 0; x < 256 * 256; ++x) { - testrule[x] = rand() & 0xFF; - } - testbuff = (tjs_uint32*)malloc((256 * 256 * 4 + 2) * sizeof(tjs_uint32)); - testdata1 = testbuff /*+ 1*/; - testdata2 = testdata1 + 256 * 256; - testdest1 = testdata2 + 256 * 256; - testdest2 = testdest1 + 256 * 256; - } - int obfu = 65531; - for(int x = 0; x < 256; ++x) { - for(int y = 0; y < 256; ++y) { - typedef struct { - unsigned char a; - unsigned char r; - unsigned char g; - unsigned char b; - } 
clr; - clr *clr1 = (clr*)(testdata1 + 256 * y + x), - *clr2 = (clr*)(testdata2 + 256 * y + x); - clr1->a = 255 - x; clr2->a = 255 - y; - clr1->r = x; clr2->r = y; - clr1->g = y; clr2->g = 255 - x; - clr1->b = 255 - y; clr2->b = x; - if (y == 0) { - clr1->a = obfu; - obfu = obfu * 3 + 29; - clr2->a = obfu; - obfu = obfu * 3 + 29; - clr1->r = obfu; - obfu = obfu * 3 + 29; - clr2->r = obfu; - obfu = obfu * 3 + 29; - clr1->g = obfu; - obfu = obfu * 3 + 29; - clr2->g = obfu; - obfu = obfu * 3 + 29; - clr1->b = obfu; - obfu = obfu * 3 + 29; - clr2->b = obfu; - obfu = obfu * 3 + 29; - } - } - } - memcpy(testdest1, testdata2, 256 * 256 * 4); - memcpy(testdest2, testdata2, 256 * 256 * 4); -} - -#if defined(TEST_ARM_NEON_CODE) -static void CheckTestData(const char *pszFuncName) -{ - typedef union{ - struct { - unsigned char r; - unsigned char g; - unsigned char b; - unsigned char a; - }; - unsigned long u32; - } clr; clr clr1, clr2; - for (int i = 0; i < 256 * 256; ++i) { - clr1.u32 = testdest1[i]; - clr2.u32 = testdest2[i]; - if (clr1.a <= 1 && clr2.a <= 1) continue; - if( abs(clr1.a - clr2.a) > 1 || - abs(clr1.r - clr2.r) > 1 || - abs(clr1.g - clr2.g) > 1 || - abs(clr1.b - clr2.b) > 1 ) - { - char tmp[256]; - sprintf(tmp, "test fail on function %s", pszFuncName); -#ifdef _MSC_VER - cv::Mat test1(256, 256, CV_8UC4, testdest1, 1024); - cv::Mat test2(256, 256, CV_8UC4, testdest2, 1024); - ShowInMessageBox(tmp); -#endif -#if !defined(WIN32) && 0 - const char bmphdr[] = "\x42\x4D\x36\x00\x04\x00\x00\x00\x00\x00\x36\x00\x00\x00\x28\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x01\x00\x20\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12\x0B\x00\x00\x12\x0B\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - FILE* f = fopen("/sdcard/testdest1.bmp", "wb"); - fwrite(bmphdr, sizeof(bmphdr), 1, f); - fwrite(testdest1, 256 * 256, 4, f); - fclose(f); - f = fopen("/sdcard/testdest2.bmp", "wb"); - fwrite(bmphdr, sizeof(bmphdr), 1, f); - fwrite(testdest2, 256 * 256, 4, f); - fclose(f); -#endif - return; - } - } 
- //SDL_Log("cheking %s pass", pszFuncName); -} -#endif -static void CheckTestData_RGB(const char *pszFuncName) -{ - for (int i = 0; i < 256 * 256; ++i) { - if ((testdest2[i] & 0xFFFFFF) != (testdest1[i] & 0xFFFFFF)) { - char tmp[256]; - sprintf(tmp, "test fail on function %s", pszFuncName); - ShowInMessageBox(tmp); - //assert(!pszFuncName); - return; - } - } - //SDL_Log("cheking %s pass", pszFuncName); -} - -static void testTLG6_chroma() -{ - for (int i = 0; i < 32; ++i) { - tjs_uint8 block_src_ref[32 * 4]; - tjs_uint8 block_src[32 * 4]; - for (int j = 0; j < 32 * 4; ++j) { - block_src_ref[j] = 240 - i - j * 3; - block_src[j] = 16 + i + j * 3; - } - tjs_uint32 testdest1[256]; - tjs_uint32 testdest2[256]; - for (tjs_uint8 ft = 0; ft < 32; ++ft) { - TVPTLG6DecodeLineGeneric_NEON((tjs_uint32 *)block_src_ref, testdest1, 64, 0, 1, &ft, 0, (tjs_uint32 *)block_src, 0, 0, 0); - TVPTLG6DecodeLineGeneric_c((tjs_uint32 *)block_src_ref, testdest2, 64, 0, 1, &ft, 0, (tjs_uint32 *)block_src, 0, 0, 0); - if (memcmp(testdest1, testdest2, 8 * 4) != 0) { - ShowInMessageBox("test fail on function TVPTLG6DecodeLineGeneric"); - assert(0); - } - } - tjs_uint8 *psrc[4] = { - block_src, - block_src + 1, - block_src + 3, - block_src + 2, - }; - - TVPTLG5ComposeColors3To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); - TVPTLG5ComposeColors3To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); - if (memcmp(testdest1, testdest2, 8 * 4) != 0) { - ShowInMessageBox("test fail on function TVPTLG5ComposeColors3To4"); - assert(0); - } - TVPTLG5ComposeColors4To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); - TVPTLG5ComposeColors4To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); - if (memcmp(testdest1, testdest2, 8 * 4) != 0) { - ShowInMessageBox("test fail on function TVPTLG5ComposeColors4To4"); - assert(0); - } - } -} - -#ifdef LOG_NEON_TEST -#define SHOW_AND_CLEAR_LOG ShowInMessageBox(LogData); pLogData = LogData; -#else -#define SHOW_AND_CLEAR_LOG -#endif - -#ifdef 
TEST_ARM_NEON_CODE - -#define REGISTER_TVPGL_BLEND_FUNC_2(origf, f) \ - InitTestData();\ - origf(testdest2, testdata1, 256 * 256);\ - f = f##_NEON;\ - f##_NEON(testdest1, testdata1, 256 * 256);\ - CheckTestData(#f); -#define REGISTER_TVPGL_BLEND_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, testdata1, 256 * 256, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, testdata1, 256 * 256, __VA_ARGS__);\ - CheckTestData(#f); -#define REGISTER_TVPGL_STRECH_FUNC_2(origf, f) \ - InitTestData();\ - origf(testdest2, 16 * 256, testdata1, 0, 1 << 16);\ - f = f##_NEON;\ - f##_NEON(testdest1, 16 * 256, testdata1, 0, 1 << 16);\ - CheckTestData(#f); -#define REGISTER_TVPGL_STRECH_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, 16 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, 16 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ - CheckTestData(#f); -#define REGISTER_TVPGL_LINTRANS_FUNC_2(origf, f) \ - InitTestData();\ - origf(testdest2, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64);\ - f = f##_NEON;\ - f##_NEON(testdest1, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64);\ - CheckTestData(#f); -#define REGISTER_TVPGL_LINTRANS_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64, __VA_ARGS__);\ - CheckTestData(#f); -#define REGISTER_TVPGL_UNIVTRANS_FUNC(origf, f) \ - InitTestData();\ - origf(testdest2, testdata1, testdata2, testrule, testtable, 256 * 256);\ - f = f##_NEON;\ - f##_NEON(testdest1, testdata1, testdata2, testrule, testtable, 256 * 256);\ - CheckTestData_RGB(#f); -#define REGISTER_TVPGL_CUSTOM_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, __VA_ARGS__);\ - CheckTestData(#f); -#define REGISTER_TVPGL_CUSTOM_FUNC_RGB(origf, f, ...) 
\ - InitTestData();\ - origf(testdest2, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, __VA_ARGS__);\ - CheckTestData_RGB(#f); -#define REGISTER_TVPGL_CUSTOM_FUNC_TYPE(origf, f, DT, ...) \ - InitTestData();\ - origf((DT)testdest2, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON((DT)testdest1, __VA_ARGS__);\ - CheckTestData(#f); -#else -#ifdef LOG_NEON_TEST - -static tjs_uint32 lastTick1, lastTick2; -static tjs_int tickC, tickNEON; -static unsigned int LogDataSize = 1024; -static char *LogData, *pLogData; - -static void AddLog(const char *format, ...) { - va_list args; - va_start(args, format); - char buf[256]; - vsnprintf(buf, 256 - 3, format, args); - char *p = buf; - if (!LogData) { - LogData = (char*)malloc(LogDataSize); - pLogData = LogData; - } - - while (*p) { - if (LogData + LogDataSize - pLogData <= 2) { - int used = pLogData - LogData; - LogDataSize += 1024; - LogData = (char*)realloc(LogData, LogDataSize); - pLogData = LogData + used; - } - *pLogData++ = *p++; - } - *pLogData++ = '\n'; - *pLogData = '\0'; - - - va_end(args); -} -#ifdef _MSC_VER -#define TEST_COUNT 0 -#else -#define TEST_COUNT 200 -#endif - -static void logTLG6_chroma() { - if (!TEST_COUNT) return; - tickC = 0; tickNEON = 0; - for (int i = 0; i < 32; ++i) { - tjs_uint8 block_src_ref[32 * 4]; - tjs_uint8 block_src[32 * 4]; - for (int j = 0; j < 32 * 4; ++j) { - block_src_ref[j] = 240 - i - j * 3; - block_src[j] = 16 + i + j * 3; - } - tjs_uint32 testdest1[256]; - tjs_uint32 testdest2[256]; - lastTick1 = TVPGetTickTime(); - for (int n = 0; n < TEST_COUNT * 4; ++n) - for (tjs_uint8 ft = 0; ft < 32; ++ft) { - TVPTLG6DecodeLineGeneric_c((tjs_uint32 *)block_src_ref, testdest2, 64, 0, 1, &ft, 0, (tjs_uint32 *)block_src, 0, 0, 0); - } - tickC += TVPGetTickTime() - lastTick1; - lastTick1 = TVPGetTickTime(); - for (int n = 0; n < TEST_COUNT * 4; ++n) - for (tjs_uint8 ft = 0; ft < 32; ++ft) { - TVPTLG6DecodeLineGeneric_NEON((tjs_uint32 *)block_src_ref, testdest2, 64, 0, 1, &ft, 0, (tjs_uint32 
*)block_src, 0, 0, 0); - } - tickNEON += TVPGetTickTime() - lastTick1; - } - AddLog("%s: %d ms, NEON: %d ms(%g%%)", "TVPTLG6DecodeLineGeneric", tickC, tickNEON, (float)tickNEON / tickC * 100); - - tickC = 0; tickNEON = 0; - for (int i = 0; i < 32; ++i) { - tjs_uint8 block_src_ref[32 * 4]; - tjs_uint8 block_src[32 * 4]; - for (int j = 0; j < 32 * 4; ++j) { - block_src_ref[j] = 240 - i - j * 3; - block_src[j] = 16 + i + j * 3; - } - tjs_uint32 testdest1[256]; - tjs_uint32 testdest2[256]; - - tjs_uint8 *psrc[4] = { - block_src, - block_src + 1, - block_src + 3, - block_src + 2, - }; - - lastTick1 = TVPGetTickTime(); - for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors3To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); - tickC += TVPGetTickTime() - lastTick1; - lastTick1 = TVPGetTickTime(); - for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors3To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); - tickNEON += TVPGetTickTime() - lastTick1; - } - AddLog("%s: %d ms, NEON: %d ms(%g%%)", "TVPTLG5ComposeColors3To4", tickC, tickNEON, (float)tickNEON / tickC * 100); - - tickC = 0; tickNEON = 0; - for (int i = 0; i < 32; ++i) { - tjs_uint8 block_src_ref[32 * 4]; - tjs_uint8 block_src[32 * 4]; - for (int j = 0; j < 32 * 4; ++j) { - block_src_ref[j] = 240 - i - j * 3; - block_src[j] = 16 + i + j * 3; - } - tjs_uint32 testdest1[256]; - tjs_uint32 testdest2[256]; - - tjs_uint8 *psrc[4] = { - block_src, - block_src + 1, - block_src + 3, - block_src + 2, - }; - - lastTick1 = TVPGetTickTime(); - for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors4To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); - tickC += TVPGetTickTime() - lastTick1; - lastTick1 = TVPGetTickTime(); - for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors4To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); - tickNEON += TVPGetTickTime() - lastTick1; - } - AddLog("%s: %d ms, NEON: %d ms(%g%%)", "TVPTLG5ComposeColors4To4", tickC, tickNEON, (float)tickNEON / 
tickC * 100); -} - -#define REGISTER_TVPGL_BLEND_FUNC_2(origf, f) \ - InitTestData();\ - origf(testdest2, testdata1, 256 * 256);\ - f = f##_NEON;\ - f##_NEON(testdest1, testdata1, 256 * 256);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for (int i = 0; i < TEST_COUNT; ++i) origf(testdest2, testdata1, 256 * 256); \ - lastTick2 = TVPGetTickTime();\ - for (int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, testdata1, 256 * 256); \ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } - -#define REGISTER_TVPGL_BLEND_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, testdata1, 256 * 256, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, testdata1, 256 * 256, __VA_ARGS__);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for (int i = 0; i < TEST_COUNT; ++i) origf(testdest2, testdata1, 256 * 256, __VA_ARGS__); \ - lastTick2 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, testdata1, 256 * 256, __VA_ARGS__);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } - -#define REGISTER_TVPGL_STRECH_FUNC_2(origf, f, ...) 
\ - InitTestData();\ - origf(testdest2, 127 * 256, testdata1, 0, 1 << 16);\ - f = f##_NEON;\ - f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) origf(testdest2, 127 * 256, testdata1, 0, 1 << 16); \ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_STRECH_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) origf(testdest2, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_STRECH_FUNC_0(origf, f) \ - InitTestData();\ - origf(testdest2, 127 * 256, testdata1, 0, 1 << 16);\ - f = f##_NEON;\ - f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) origf(testdest2, 127 * 256, testdata1, 0, 1 << 16);\ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - 
lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_LINTRANS_FUNC_2(origf, f) \ - InitTestData();\ - origf(testdest2, 127 * 256, testdata1, 0, 0, 1 << 16, 0, 256); \ - f = f##_NEON;\ - f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1 << 16, 0, 256); \ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for (int i = 0; i < TEST_COUNT; ++i) origf(testdest2, 127 * 256, testdata1, 0, 0, 1 << 16, 0, 256); \ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_LINTRANS_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) origf(testdest2, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_UNIVTRANS_FUNC(origf, f, ...) 
\ - InitTestData();\ - origf(testdest2, testdata1, testdata2, testrule, testtable, 256 * 256);\ - f = f##_NEON;\ - f##_NEON(testdest1, testdata1, testdata2, testrule, testtable, 256 * 256);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) origf(testdest2, testdata1, testdata2, testrule, testtable, 256 * 256);\ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, testdata1, testdata2, testrule, testtable, 256 * 256);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_CUSTOM_FUNC(origf, f, ...) \ - InitTestData();\ - origf(testdest2, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, __VA_ARGS__);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) origf(testdest2, __VA_ARGS__);\ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, __VA_ARGS__);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_CUSTOM_FUNC_RGB(origf, f, ...) \ - InitTestData();\ - origf(testdest2, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON(testdest1, __VA_ARGS__);\ - CheckTestData_RGB(#f); if (TEST_COUNT) {\ - InitTestData();\ - lastTick1 = TVPGetTickTime();\ - for(int i = 0; i < TEST_COUNT; ++i) origf(testdest2, __VA_ARGS__);\ - lastTick2 = TVPGetTickTime(); \ - for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, __VA_ARGS__);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#define REGISTER_TVPGL_CUSTOM_FUNC_TYPE(origf, f, DT, ...) 
\ - InitTestData();\ - origf((DT)testdest2, __VA_ARGS__);\ - f = f##_NEON;\ - f##_NEON((DT)testdest1, __VA_ARGS__);\ - CheckTestData(#f); if (TEST_COUNT) {\ - InitTestData(); \ - lastTick1 = TVPGetTickTime(); \ - for (int i = 0; i < TEST_COUNT; ++i) origf((DT)testdest2, __VA_ARGS__); \ - lastTick2 = TVPGetTickTime(); \ - for (int i = 0; i < TEST_COUNT; ++i) f##_NEON((DT)testdest1, __VA_ARGS__);\ - AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); \ - f = f##_NEON; } -#else -#define REGISTER_TVPGL_BLEND_FUNC_2(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_BLEND_FUNC(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_STRECH_FUNC_2(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_STRECH_FUNC(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_LINTRANS_FUNC_2(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_LINTRANS_FUNC(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_UNIVTRANS_FUNC(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_CUSTOM_FUNC(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_CUSTOM_FUNC_RGB(origf, f, ...) f = f##_NEON; -#define REGISTER_TVPGL_CUSTOM_FUNC_TYPE(origf, f, ...) f = f##_NEON; -#endif -#endif -#define REGISTER_TVPGL_ONLY(origf, f) origf = f; - -#endif - -FUNC_API void calcBezierPatch_c(float* result, /*const */float* arr/*16*/, /*const */float* a3); -FUNC_API void calcBezierPatch_NEON(float* result, float* arr/*16*/, float* p); - -FUNC_API void TVPGL_ASM_Init() -{ - if ((TVPCPUFeatures & TVP_CPU_FAMILY_MASK) == TVP_CPU_FAMILY_ARM && (TVPCPUFeatures & TVP_CPU_HAS_NEON)) - { - TVPInitTVPGL(); -#ifdef LOG_NEON_TEST -#if 0 - do { // test calcBezierPatch - float arr[32]; - float resultC[2], resultNEON[2], - pt[2] = { - ((rand() & 1) ? -1 : 1) * ((float)rand() / rand()), - ((rand() & 1) ? -1 : 1) * ((float)rand() / rand()) - }; - for (int i = 0; i < 32; ++i) { - arr[i] = ((rand() & 1) ? 
-1 : 1) * ((float)rand() / rand()); - } - calcBezierPatch_c(resultC, arr, pt); - calcBezierPatch_NEON(resultNEON, arr, pt); - if (resultC[0] != resultNEON[0] || resultC[1] != resultNEON[1]) { - ShowInMessageBox("test calcBezierPatch fail"); - } - if (!TEST_COUNT) break; - for (int i = 0; i < 4; ++i) { - lastTick1 = TVPGetTickTime(); - for (int i = 0; i < 160000; ++i) calcBezierPatch_c(resultC, arr, pt); - lastTick2 = TVPGetTickTime(); - for (int i = 0; i < 160000; ++i) calcBezierPatch_NEON(resultNEON, arr, pt); - AddLog("calcBezierPatch: %d ms, NEON: %d ms(%g%%)", (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetTickTime() - lastTick2), (float)tickNEON / tickC * 100); - } - SHOW_AND_CLEAR_LOG; - } while (0); -#endif -#undef TEST_COUNT -#define TEST_COUNT 1000 - usleep(1000000 * 3); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyMask_c, TVPCopyMask); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyMask_c, TVPCopyMask); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyMask_c, TVPCopyMask); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyMask_c, TVPCopyMask); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyColor_c, TVPCopyColor); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyColor_c, TVPCopyColor); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyColor_c, TVPCopyColor); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyColor_c, TVPCopyColor); - SHOW_AND_CLEAR_LOG; - -#undef TEST_COUNT -#define TEST_COUNT 200 - - REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_d_c, TVPUnivTransBlend_d); - REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_d_c, TVPUnivTransBlend_d); - REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_d_c, TVPUnivTransBlend_d); - REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_d_c, TVPUnivTransBlend_d); - REGISTER_TVPGL_BLEND_FUNC(TVPColorDodgeBlend_HDA_o_c, TVPColorDodgeBlend_o, 100); - REGISTER_TVPGL_BLEND_FUNC(TVPColorDodgeBlend_HDA_o_c, TVPColorDodgeBlend_o, 100); - REGISTER_TVPGL_BLEND_FUNC(TVPColorDodgeBlend_HDA_o_c, TVPColorDodgeBlend_o, 100); - REGISTER_TVPGL_BLEND_FUNC(TVPColorDodgeBlend_HDA_o_c, TVPColorDodgeBlend_o, 100); - 
SHOW_AND_CLEAR_LOG; -#endif - // use NEON-optimized routines - //_Initialize_Route_Ptr(); -#if 1 - REGISTER_TVPGL_BLEND_FUNC_2(TVPAlphaBlend_HDA, TVPAlphaBlend); - REGISTER_TVPGL_ONLY(TVPAlphaBlend_HDA, TVPAlphaBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPAlphaBlend_HDA_o, TVPAlphaBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPAlphaBlend_HDA_o, TVPAlphaBlend_o_NEON); - REGISTER_TVPGL_BLEND_FUNC_2(TVPAlphaBlend_d, TVPAlphaBlend_d); - REGISTER_TVPGL_BLEND_FUNC_2(TVPAlphaBlend_a, TVPAlphaBlend_a); - REGISTER_TVPGL_BLEND_FUNC(TVPAlphaBlend_do, TVPAlphaBlend_do, 100); - REGISTER_TVPGL_BLEND_FUNC(TVPAlphaBlend_ao, TVPAlphaBlend_ao, 100); - - REGISTER_TVPGL_CUSTOM_FUNC(TVPAlphaColorMat, TVPAlphaColorMat, 0x98765432, 256 * 256); - REGISTER_TVPGL_BLEND_FUNC_2(TVPAdditiveAlphaBlend_HDA, TVPAdditiveAlphaBlend); - REGISTER_TVPGL_ONLY(TVPAdditiveAlphaBlend_HDA, TVPAdditiveAlphaBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPAdditiveAlphaBlend_HDA_o, TVPAdditiveAlphaBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPAdditiveAlphaBlend_HDA_o, TVPAdditiveAlphaBlend_o_NEON); - REGISTER_TVPGL_BLEND_FUNC_2(TVPAdditiveAlphaBlend_a, TVPAdditiveAlphaBlend_a); - REGISTER_TVPGL_BLEND_FUNC(TVPAdditiveAlphaBlend_ao, TVPAdditiveAlphaBlend_ao, 100); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_CUSTOM_FUNC(TVPConvertAlphaToAdditiveAlpha, TVPConvertAlphaToAdditiveAlpha, 256 * 256); - REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_HDA, TVPStretchAlphaBlend); - REGISTER_TVPGL_ONLY(TVPStretchAlphaBlend_HDA, TVPStretchAlphaBlend_NEON); - REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_HDA_o, TVPStretchAlphaBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPStretchAlphaBlend_HDA_o, TVPStretchAlphaBlend_o_NEON); - REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_d, TVPStretchAlphaBlend_d); - REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_a, TVPStretchAlphaBlend_a); - REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_do, TVPStretchAlphaBlend_do, 100); - REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_ao, TVPStretchAlphaBlend_ao, 
100); - - REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAdditiveAlphaBlend_HDA, TVPStretchAdditiveAlphaBlend); - REGISTER_TVPGL_ONLY(TVPStretchAdditiveAlphaBlend_HDA, TVPStretchAlphaBlend_NEON); - REGISTER_TVPGL_STRECH_FUNC(TVPStretchAdditiveAlphaBlend_HDA_o, TVPStretchAdditiveAlphaBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPStretchAdditiveAlphaBlend_HDA_o, TVPStretchAlphaBlend_o_NEON); - REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAdditiveAlphaBlend_a, TVPStretchAdditiveAlphaBlend_a); - REGISTER_TVPGL_STRECH_FUNC(TVPStretchAdditiveAlphaBlend_ao, TVPStretchAdditiveAlphaBlend_ao, 100); - REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpStretchAdditiveAlphaBlend, TVPInterpStretchAdditiveAlphaBlend, - 16 * 256, testdata1, testdata2, 127, 0, 1<<16); - REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpStretchAdditiveAlphaBlend_o, TVPInterpStretchAdditiveAlphaBlend_o, - 16 * 256, testdata1, testdata2, 127, 0, 1<<16, 100); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAlphaBlend_HDA, TVPLinTransAlphaBlend); - REGISTER_TVPGL_ONLY(TVPLinTransAlphaBlend_HDA, TVPLinTransAlphaBlend_NEON); - REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAlphaBlend_HDA_o, TVPLinTransAlphaBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPLinTransAlphaBlend_HDA_o, TVPLinTransAlphaBlend_o_NEON); - //REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAlphaBlend_d, TVPLinTransAlphaBlend_d); // performance issue ! 
- REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAlphaBlend_a, TVPLinTransAlphaBlend_a); - REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAlphaBlend_do, TVPLinTransAlphaBlend_do, 100); - REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAlphaBlend_ao, TVPLinTransAlphaBlend_ao, 100); - - REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAdditiveAlphaBlend_HDA, TVPLinTransAdditiveAlphaBlend); - REGISTER_TVPGL_ONLY(TVPLinTransAdditiveAlphaBlend_HDA, TVPLinTransAdditiveAlphaBlend_NEON); - REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAdditiveAlphaBlend_HDA_o, TVPLinTransAdditiveAlphaBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPLinTransAdditiveAlphaBlend_HDA_o, TVPLinTransAdditiveAlphaBlend_o_NEON); - REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAdditiveAlphaBlend_a, TVPLinTransAdditiveAlphaBlend_a); - REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAdditiveAlphaBlend_ao, TVPLinTransAdditiveAlphaBlend_ao, 100); - REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpLinTransAdditiveAlphaBlend, TVPInterpLinTransAdditiveAlphaBlend, - 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64); - REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpLinTransAdditiveAlphaBlend_o, TVPInterpLinTransAdditiveAlphaBlend_o, - 8 * 256, testdata1, 0, 0, 1 << 16, 1 << 16, 64, 100); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyOpaqueImage, TVPCopyOpaqueImage); - REGISTER_TVPGL_BLEND_FUNC(TVPConstAlphaBlend_HDA, TVPConstAlphaBlend, 100); - REGISTER_TVPGL_ONLY(TVPConstAlphaBlend_HDA, TVPConstAlphaBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPConstAlphaBlend_d, TVPConstAlphaBlend_d, 100); - REGISTER_TVPGL_BLEND_FUNC(TVPConstAlphaBlend_a, TVPConstAlphaBlend_a, 100); - - //REGISTER_TVPGL_STRECH_FUNC_0(TVPStretchCopyOpaqueImage, TVPStretchCopyOpaqueImage); // performance issue - REGISTER_TVPGL_STRECH_FUNC(TVPStretchConstAlphaBlend_HDA, TVPStretchConstAlphaBlend, 100); - REGISTER_TVPGL_ONLY(TVPStretchConstAlphaBlend_HDA, TVPStretchConstAlphaBlend_NEON); - REGISTER_TVPGL_STRECH_FUNC(TVPStretchConstAlphaBlend_d, TVPStretchConstAlphaBlend_d, 100); - 
REGISTER_TVPGL_STRECH_FUNC(TVPStretchConstAlphaBlend_a, TVPStretchConstAlphaBlend_a, 100); - //REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransCopyOpaqueImage, TVPLinTransCopyOpaqueImage); // performance issue ! - REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpStretchConstAlphaBlend, TVPInterpStretchConstAlphaBlend, - 16 * 256, testdata1, testdata2, 127, 0, 1<<16, 100); - //REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransConstAlphaBlend_HDA, TVPLinTransConstAlphaBlend, 100); // performance issue ! - //REGISTER_TVPGL_ONLY(TVPLinTransConstAlphaBlend_HDA, TVPLinTransConstAlphaBlend_NEON); // performance issue ! - //REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransConstAlphaBlend_d, TVPLinTransConstAlphaBlend_d, 100); // performance issue ! - REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransConstAlphaBlend_a, TVPLinTransConstAlphaBlend_a, 100); - //REGISTER_TVPGL_LINTRANS_FUNC(TVPInterpLinTransConstAlphaBlend, TVPInterpLinTransConstAlphaBlend, 100); // performance issue ! - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPConstAlphaBlend_SD, TVPConstAlphaBlend_SD, testdata1, testdata2, 256 * 256, 100); - REGISTER_TVPGL_CUSTOM_FUNC(TVPConstAlphaBlend_SD_a, TVPConstAlphaBlend_SD_a, testdata1, testdata2, 256 * 256, 100); - REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPConstAlphaBlend_SD_d, TVPConstAlphaBlend_SD_d, testdata1, testdata2, 256 * 256, 100); - - // TVPInitUnivTransBlendTable - REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend, TVPUnivTransBlend); - REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_d, TVPUnivTransBlend_d); - REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_a, TVPUnivTransBlend_a); -// REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_switch, TVPUnivTransBlend_switch, 240, 32); -// REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_switch_d, TVPUnivTransBlend_switch_d, 240, 32); -// REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_switch_a, TVPUnivTransBlend_switch_a, 240, 32); - - REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_HDA, TVPApplyColorMap, testrule, 256 * 256, 0x55d20688); - 
REGISTER_TVPGL_ONLY(TVPApplyColorMap_HDA, TVPApplyColorMap_NEON); - REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_HDA_o, TVPApplyColorMap_o, testrule, 256 * 256, 0x55d20688, 100); - REGISTER_TVPGL_ONLY(TVPApplyColorMap_HDA_o, TVPApplyColorMap_o_NEON); - REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_d, TVPApplyColorMap_d, testrule, 256 * 256, 0x55d20688); - REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_a, TVPApplyColorMap_a, testrule, 256 * 256, 0x55d20688); - REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_do, TVPApplyColorMap_do, testrule, 256 * 256, 0x55d20688, 100); - REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_ao, TVPApplyColorMap_ao, testrule, 256 * 256, 0x55d20688, 100); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_CUSTOM_FUNC(TVPConstColorAlphaBlend, TVPConstColorAlphaBlend, 256 * 256, 0x55d20688, 100); - REGISTER_TVPGL_CUSTOM_FUNC(TVPConstColorAlphaBlend_d, TVPConstColorAlphaBlend_d, 256 * 256, 0x55d20688, 100); - REGISTER_TVPGL_CUSTOM_FUNC(TVPConstColorAlphaBlend_a, TVPConstColorAlphaBlend_a, 256 * 256, 0x55d20688, 100); - - REGISTER_TVPGL_CUSTOM_FUNC(TVPRemoveConstOpacity, TVPRemoveConstOpacity, 256 * 256, 100); - REGISTER_TVPGL_CUSTOM_FUNC(TVPRemoveOpacity, TVPRemoveOpacity, testrule, 255 * 256); - REGISTER_TVPGL_CUSTOM_FUNC(TVPRemoveOpacity_o, TVPRemoveOpacity_o, testrule, 255 * 256, 100); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPAddBlend, TVPAddBlend); - REGISTER_TVPGL_BLEND_FUNC_2(TVPAddBlend_HDA, TVPAddBlend_HDA); - REGISTER_TVPGL_BLEND_FUNC(TVPAddBlend_HDA_o, TVPAddBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPAddBlend_HDA_o, TVPAddBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPSubBlend, TVPSubBlend); - REGISTER_TVPGL_BLEND_FUNC_2(TVPSubBlend_HDA, TVPSubBlend_HDA); - REGISTER_TVPGL_BLEND_FUNC(TVPSubBlend_HDA_o, TVPSubBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPSubBlend_HDA_o, TVPSubBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPMulBlend_HDA, TVPMulBlend_HDA); - REGISTER_TVPGL_BLEND_FUNC_2(TVPMulBlend, TVPMulBlend); - REGISTER_TVPGL_BLEND_FUNC(TVPMulBlend_HDA_o, 
TVPMulBlend_HDA_o, 100); - REGISTER_TVPGL_BLEND_FUNC(TVPMulBlend_o, TVPMulBlend_o, 100); - - SHOW_AND_CLEAR_LOG; - -// REGISTER_TVPGL_BLEND_FUNC_2(TVPColorDodgeBlend_HDA, TVPColorDodgeBlend); // performance issue -// REGISTER_TVPGL_ONLY(TVPColorDodgeBlend_HDA, TVPColorDodgeBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPColorDodgeBlend_HDA_o, TVPColorDodgeBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPColorDodgeBlend_HDA_o, TVPColorDodgeBlend_o_NEON); - REGISTER_TVPGL_BLEND_FUNC_2(TVPDarkenBlend_HDA, TVPDarkenBlend); - REGISTER_TVPGL_ONLY(TVPDarkenBlend_HDA, TVPDarkenBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPDarkenBlend_HDA_o, TVPDarkenBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPDarkenBlend_HDA_o, TVPDarkenBlend_o_NEON); - REGISTER_TVPGL_BLEND_FUNC_2(TVPLightenBlend_HDA, TVPLightenBlend); - REGISTER_TVPGL_ONLY(TVPLightenBlend_HDA, TVPLightenBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPLightenBlend_HDA_o, TVPLightenBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPLightenBlend_HDA_o, TVPLightenBlend_o_NEON); - REGISTER_TVPGL_BLEND_FUNC_2(TVPScreenBlend_HDA, TVPScreenBlend); - REGISTER_TVPGL_ONLY(TVPScreenBlend_HDA, TVPScreenBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPScreenBlend_HDA_o, TVPScreenBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPScreenBlend_HDA_o, TVPScreenBlend_o_NEON); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_CUSTOM_FUNC(TVPInterpStretchCopy, TVPInterpStretchCopy, - 127 * 256, testdata1, testdata2, 127, 0, 1<<16); - -// TVPFastLinearInterpH2F, TVPFastLinearInterpH2F_c; -// TVPFastLinearInterpH2B, TVPFastLinearInterpH2B_c; - - REGISTER_TVPGL_CUSTOM_FUNC(TVPFastLinearInterpV2, TVPFastLinearInterpV2, - 256 * 256, testdata1, testdata2); - - //TVPStretchColorCopy, TVPStretchColorCopy_c; - - //REGISTER_TVPGL_LINTRANS_FUNC_2(TVPInterpLinTransCopy, TVPInterpLinTransCopy); // performance issue ! 
- - //TVPMakeAlphaFromKey, TVPMakeAlphaFromKey_c; - - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyMask, TVPCopyMask); - REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyColor, TVPCopyColor); - REGISTER_TVPGL_CUSTOM_FUNC(TVPBindMaskToMain, TVPBindMaskToMain, testrule, 256 * 256); - - // NEON's TVPFillARGB is slower than plain C -// REGISTER_TVPGL_CUSTOM_FUNC(TVPFillARGB_c, TVPFillARGB, 256 * 256, 0x55d20688); -// REGISTER_TVPGL_ONLY(TVPFillARGB_NC, TVPFillARGB_NEON); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_CUSTOM_FUNC(TVPFillColor, TVPFillColor, 256 * 256, 0x55d20688); - REGISTER_TVPGL_CUSTOM_FUNC(TVPFillMask, TVPFillMask, 256 * 256, 0x55d20688); - REGISTER_TVPGL_CUSTOM_FUNC_TYPE(TVPAddSubVertSum16, TVPAddSubVertSum16, tjs_uint16*, testdata1, testdata2, 128 * 256); - REGISTER_TVPGL_CUSTOM_FUNC_TYPE(TVPAddSubVertSum16_d, TVPAddSubVertSum16_d, tjs_uint16*, testdata1, testdata2, 128 * 256); - -// TVPAddSubVertSum32, TVPAddSubVertSum32_c; -// TVPAddSubVertSum32_d, TVPAddSubVertSum32_d_c; -// TVPDoBoxBlurAvg16, TVPDoBoxBlurAvg16_c; -// TVPDoBoxBlurAvg16_d, TVPDoBoxBlurAvg16_d_c; -// TVPDoBoxBlurAvg32, TVPDoBoxBlurAvg32_c; -// TVPDoBoxBlurAvg32_d, TVPDoBoxBlurAvg32_d_c; -// TVPSwapLine8, TVPSwapLine8_c; -// TVPSwapLine32, TVPSwapLine32_c; -// TVPReverse8, TVPReverse8_c; -// TVPReverse32, TVPReverse32_c; - REGISTER_TVPGL_CUSTOM_FUNC(TVPDoGrayScale, TVPDoGrayScale, 256 * 256); -// TVPInitGammaAdjustTempData, TVPInitGammaAdjustTempData_c; -// TVPUninitGammaAdjustTempData, TVPUninitGammaAdjustTempData_c; -// TVPAdjustGamma, TVPAdjustGamma_c; -// TVPAdjustGamma_a, TVPAdjustGamma_a_c; -// TVPChBlurMulCopy65, TVPChBlurMulCopy65_c; -// TVPChBlurAddMulCopy65, TVPChBlurAddMulCopy65_c; -// TVPChBlurCopy65, TVPChBlurCopy65_c; -// TVPBLExpand1BitTo8BitPal, TVPBLExpand1BitTo8BitPal_c; -// TVPBLExpand1BitTo8Bit, TVPBLExpand1BitTo8Bit_c; -// TVPBLExpand1BitTo32BitPal, TVPBLExpand1BitTo32BitPal_c; -// TVPBLExpand4BitTo8BitPal, TVPBLExpand4BitTo8BitPal_c; -// TVPBLExpand4BitTo8Bit, TVPBLExpand4BitTo8Bit_c; 
-// TVPBLExpand4BitTo32BitPal, TVPBLExpand4BitTo32BitPal_c; -// TVPBLExpand8BitTo8BitPal, TVPBLExpand8BitTo8BitPal_c; -// TVPBLExpand8BitTo32BitPal, TVPBLExpand8BitTo32BitPal_c; - - REGISTER_TVPGL_CUSTOM_FUNC(TVPExpand8BitTo32BitGray, TVPExpand8BitTo32BitGray, testrule, 256 * 256); -// TVPBLConvert15BitTo8Bit, TVPBLConvert15BitTo8Bit; - REGISTER_TVPGL_CUSTOM_FUNC(TVPBLConvert15BitTo32Bit, TVPBLConvert15BitTo32Bit, (const tjs_uint16*)testrule, 128 * 256); -// TVPBLConvert24BitTo8Bit, TVPBLConvert24BitTo8Bit; - REGISTER_TVPGL_ONLY(TVPBLConvert24BitTo32Bit, TVPConvert24BitTo32Bit_NEON); - REGISTER_TVPGL_CUSTOM_FUNC(TVPConvert24BitTo32Bit, TVPConvert24BitTo32Bit, testrule, 256 * 256 / 3); - REGISTER_TVPGL_CUSTOM_FUNC(TVPConvert32BitTo24Bit, TVPConvert32BitTo24Bit, testrule, 256 * 256 / 3); -// TVPBLConvert32BitTo8Bit, TVPBLConvert32BitTo8Bit; -// TVPBLConvert32BitTo32Bit_NoneAlpha, TVPBLConvert32BitTo32Bit_NoneAlpha; -// TVPBLConvert32BitTo32Bit_MulAddAlpha, TVPBLConvert32BitTo32Bit_MulAddAlpha; -// TVPBLConvert32BitTo32Bit_AddAlpha, TVPBLConvert32BitTo32Bit_AddAlpha; -// TVPDither32BitTo16Bit565, TVPDither32BitTo16Bit565; -// TVPDither32BitTo16Bit555, TVPDither32BitTo16Bit555; -// TVPDither32BitTo8Bit, TVPDither32BitTo8Bit; -// TVPTLG5DecompressSlide, TVPTLG5DecompressSlide; -// TVPTLG6DecodeGolombValuesForFirst, TVPTLG6DecodeGolombValuesForFirst; -// TVPTLG6DecodeGolombValues, TVPTLG6DecodeGolombValues; - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsAlphaBlend_HDA, TVPPsAlphaBlend); - REGISTER_TVPGL_ONLY(TVPPsAlphaBlend_HDA, TVPPsAlphaBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsAlphaBlend_HDA_o, TVPPsAlphaBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsAlphaBlend_HDA_o, TVPPsAlphaBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsAddBlend_HDA, TVPPsAddBlend); - REGISTER_TVPGL_ONLY(TVPPsAddBlend_HDA, TVPPsAddBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsAddBlend_HDA_o, TVPPsAddBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsAddBlend_HDA_o, TVPPsAddBlend_o_NEON); 
- - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsSubBlend_HDA, TVPPsSubBlend); - REGISTER_TVPGL_ONLY(TVPPsSubBlend_HDA, TVPPsSubBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsSubBlend_HDA_o, TVPPsSubBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsSubBlend_HDA_o, TVPPsSubBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsMulBlend_HDA, TVPPsMulBlend); - REGISTER_TVPGL_ONLY(TVPPsMulBlend_HDA, TVPPsMulBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsMulBlend_HDA_o, TVPPsMulBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsMulBlend_HDA_o, TVPPsMulBlend_o_NEON); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsScreenBlend_HDA, TVPPsScreenBlend); - REGISTER_TVPGL_ONLY(TVPPsScreenBlend_HDA, TVPPsScreenBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsScreenBlend_HDA_o, TVPPsScreenBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsScreenBlend_HDA_o, TVPPsScreenBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsOverlayBlend_HDA, TVPPsOverlayBlend); - REGISTER_TVPGL_ONLY(TVPPsOverlayBlend_HDA, TVPPsOverlayBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsOverlayBlend_HDA_o, TVPPsOverlayBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsOverlayBlend_HDA_o, TVPPsOverlayBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsHardLightBlend_HDA, TVPPsHardLightBlend); - REGISTER_TVPGL_ONLY(TVPPsHardLightBlend_HDA, TVPPsHardLightBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsHardLightBlend_HDA_o, TVPPsHardLightBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsHardLightBlend_HDA_o, TVPPsHardLightBlend_o_NEON); - -// TVPPsSoftLightBlend = TVPPsSoftLightBlend_c; -// TVPPsSoftLightBlend_o = TVPPsSoftLightBlend_o_c; -// TVPPsSoftLightBlend_HDA = TVPPsSoftLightBlend_HDA_c; -// TVPPsSoftLightBlend_HDA_o = TVPPsSoftLightBlend_HDA_o_c; -// TVPPsColorDodgeBlend = TVPPsColorDodgeBlend_c; -// TVPPsColorDodgeBlend_o = TVPPsColorDodgeBlend_o_c; -// TVPPsColorDodgeBlend_HDA = TVPPsColorDodgeBlend_HDA_c; -// TVPPsColorDodgeBlend_HDA_o = TVPPsColorDodgeBlend_HDA_o_c; -// TVPPsColorDodge5Blend = TVPPsColorDodge5Blend_c; -// TVPPsColorDodge5Blend_o = 
TVPPsColorDodge5Blend_o_c; -// TVPPsColorDodge5Blend_HDA = TVPPsColorDodge5Blend_HDA_c; -// TVPPsColorDodge5Blend_HDA_o = TVPPsColorDodge5Blend_HDA_o_c; -// TVPPsColorBurnBlend = TVPPsColorBurnBlend_c; -// TVPPsColorBurnBlend_o = TVPPsColorBurnBlend_o_c; -// TVPPsColorBurnBlend_HDA = TVPPsColorBurnBlend_HDA_c; -// TVPPsColorBurnBlend_HDA_o = TVPPsColorBurnBlend_HDA_o_c; - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsLightenBlend_HDA, TVPPsLightenBlend); - REGISTER_TVPGL_ONLY(TVPPsLightenBlend_HDA, TVPPsLightenBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsLightenBlend_HDA_o, TVPPsLightenBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsLightenBlend_HDA_o, TVPPsLightenBlend_o_NEON); - - SHOW_AND_CLEAR_LOG; - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsDarkenBlend_HDA, TVPPsDarkenBlend); - REGISTER_TVPGL_ONLY(TVPPsDarkenBlend_HDA, TVPPsDarkenBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsDarkenBlend_HDA_o, TVPPsDarkenBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsDarkenBlend_HDA_o, TVPPsDarkenBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsDiffBlend_HDA, TVPPsDiffBlend); - REGISTER_TVPGL_ONLY(TVPPsDiffBlend_HDA, TVPPsDiffBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsDiffBlend_HDA_o, TVPPsDiffBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsDiffBlend_HDA_o, TVPPsDiffBlend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsDiff5Blend_HDA, TVPPsDiff5Blend); - REGISTER_TVPGL_ONLY(TVPPsDiff5Blend_HDA, TVPPsDiff5Blend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsDiff5Blend_HDA_o, TVPPsDiff5Blend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsDiff5Blend_HDA_o, TVPPsDiff5Blend_o_NEON); - - REGISTER_TVPGL_BLEND_FUNC_2(TVPPsExclusionBlend_HDA, TVPPsExclusionBlend); - REGISTER_TVPGL_ONLY(TVPPsExclusionBlend_HDA, TVPPsExclusionBlend_NEON); - REGISTER_TVPGL_BLEND_FUNC(TVPPsExclusionBlend_HDA_o, TVPPsExclusionBlend_o, 100); - REGISTER_TVPGL_ONLY(TVPPsExclusionBlend_HDA_o, TVPPsExclusionBlend_o_NEON); - - REGISTER_TVPGL_ONLY(TVPTLG6DecodeLineGeneric, TVPTLG6DecodeLineGeneric_NEON); - REGISTER_TVPGL_ONLY(TVPTLG5ComposeColors3To4, 
TVPTLG5ComposeColors3To4_NEON); - REGISTER_TVPGL_ONLY(TVPTLG5ComposeColors4To4, TVPTLG5ComposeColors4To4_NEON); -// REGISTER_TVPGL_ONLY(TVPTLG5DecompressSlide, TVPTLG5DecompressSlide_NEON); // TODO test performance - - REGISTER_TVPGL_ONLY(TVPReverseRGB, TVPReverseRGB_NEON); - REGISTER_TVPGL_ONLY(TVPUpscale65_255, TVPUpscale65_255_NEON); - - SHOW_AND_CLEAR_LOG; -#endif -#ifdef DEBUG_ARM_NEON -#ifdef TEST_ARM_NEON_CODE - testTLG6_chroma(); -#endif - free(testbuff); -#ifdef _DEBUG - TVPInitTVPGL(); -#endif -#endif - } -} - -FUNC_API void TVPGL_ASM_Test() { -#ifdef LOG_NEON_TEST - TVPCPUFeatures |= TVP_CPU_FAMILY_ARM | TVP_CPU_HAS_NEON; - TVPGL_ASM_Init(); -#endif -} \ No newline at end of file diff --git a/src/core/visual/ARM/tvpgl_arm.cpp b/src/core/visual/ARM/tvpgl_arm.cpp new file mode 100644 index 00000000..1c86d3c1 --- /dev/null +++ b/src/core/visual/ARM/tvpgl_arm.cpp @@ -0,0 +1,3553 @@ +#include "tvpgl_arm_intf.h" +#include "tvpgl_asm_init.h" +#include +#include +#include +#include + +//#define TEST_ARM_NEON_CODE +//#define DEBUG_ARM_NEON +//#define LOG_NEON_TEST + +#ifdef __cplusplus +#if defined(_MSC_VER) +#include +#endif +#include +extern "C" { +#endif +extern unsigned char TVPNegativeMulTable[256*256]; +extern unsigned char TVPOpacityOnOpacityTable[256*256]; +extern unsigned short TVPRecipTableForOpacityOnOpacity[256]; +#ifdef __cplusplus +}; +#endif + +#define __CAT_NAME(a, b) a##b +#define _CAT_NAME(a, b) __CAT_NAME(a, b) + +template +void do_blend_(LoadFuncS lfuncs, LoadFuncD lfuncd, StoreFunc sfunc, CFunc c_func, OPFunc op_func, TDst *dest, const TSrc *src, tjs_int len, TArg... 
args) { + TDst* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & (elmcount - 1))) & (elmcount - 1)) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, src, PreFragLen, args...); + dest += PreFragLen; + src += PreFragLen; + } + } + + TDst* pVecEndDst = pEndDst - elmcount + 1; + if ((intptr_t)src & (elmcount - 1)) { + while (dest < pVecEndDst) { + TElmSrc s = lfuncs((uint8_t *)src); + TElmDst d = lfuncd((uint8_t *)__builtin_assume_aligned(dest, elmcount)); + d = op_func(s, d, args...); + sfunc((uint8_t *)__builtin_assume_aligned(dest, elmcount), d); + dest += elmcount; + src += elmcount; + } + } else { + while (dest < pVecEndDst) { + TElmSrc s = lfuncs((uint8_t *)__builtin_assume_aligned(src, elmcount)); + TElmDst d = lfuncd((uint8_t *)__builtin_assume_aligned(dest, elmcount)); + d = op_func(s, d, args...); + sfunc((uint8_t *)__builtin_assume_aligned(dest, elmcount), d); + dest += elmcount; + src += elmcount; + } + } + + if (dest < pEndDst) { + c_func(dest, src, pEndDst - dest, args...); + } +} + +static uint8x8_t __vld1_u8(uint8_t* p) { return vld1_u8(p); } +static uint8x8x4_t __vld4_u8(uint8_t* p) { return vld4_u8(p); } +static void __vst4_u8(uint8_t* p, uint8x8x4_t v) { return vst4_u8(p, v); } +template +void do_blend(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, TArg... args) { + do_blend_<8, uint8x8x4_t, uint8x8x4_t>(__vld4_u8, __vld4_u8, __vst4_u8, c_func, op_func, dest, src, len, args...); +} +static uint8x16x4_t __vld4q_u8(uint8_t* p) { + __builtin_prefetch(p, 0, 0); + return vld4q_u8(p); +} +static void __vst4q_u8(uint8_t* p, uint8x16x4_t v) { return vst4q_u8(p, v); } +template +void do_blend_128(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, TArg... 
args) { + do_blend_<16, uint8x16x4_t, uint8x16x4_t>(__vld4q_u8, __vld4q_u8, __vst4q_u8, c_func, op_func, dest, src, len, args...); +} + +template +void do_blend_lum(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, TArg... args) { + do_blend_<8, uint8x8x4_t, uint8x8_t>(__vld1_u8, __vld4_u8, __vst4_u8, c_func, op_func, dest, src, len, args...); +} + +template +void do_stretch_blend(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, tjs_int len, + const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, TArg... args) { + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, PreFragLen, src, srcstart, srcstep, args...); + dest += PreFragLen; + srcstart += PreFragLen * srcstep; + } + } + uint8_t strechbuff[32 + 16]; + tjs_uint32 *tmp = (tjs_uint32*)((((intptr_t)strechbuff) + 7) & ~7); + tjs_uint32* pVecEndDst = pEndDst - 7; + while (dest < pVecEndDst) { + for (int i = 0; i < 8; ++i) { + tmp[i] = src[(srcstart) >> 16]; + srcstart += srcstep; + } + uint8x8x4_t s = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp, 8)); + uint8x8x4_t d = vld4_u8((uint8_t *)__builtin_assume_aligned(dest, 8)); + d = op_func(s, d, args...); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8; + } + if (dest < pEndDst) { + c_func(dest, pEndDst - dest, src, srcstart, srcstep, args...); + } +} + +template +void do_lintrans_blend(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, tjs_int len, + const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, TArg... 
args) { + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, PreFragLen, src, sx, sy, stepx, stepy, srcpitch, args...); + dest += PreFragLen; + sx += stepx * PreFragLen; + sy += stepy * PreFragLen; + } + } + uint8_t strechbuff[32 + 16]; + tjs_uint32 *tmp = (tjs_uint32*)((((intptr_t)strechbuff) + 7) & ~7); + tjs_uint32* pVecEndDst = pEndDst - 7; + while (dest < pVecEndDst) { + for (int i = 0; i < 8; ++i) { + tmp[i] = *((const tjs_uint32*)((const tjs_uint8*)src + ((sy) >> 16)*srcpitch) + ((sx) >> 16)); + sx += stepx; + sy += stepy; + } + uint8x8x4_t s = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp, 8)); + uint8x8x4_t d = vld4_u8((uint8_t *)__builtin_assume_aligned(dest, 8)); + d = op_func(s, d, args...); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8; + } + if (dest < pEndDst) { + c_func(dest, pEndDst - dest, src, sx, sy, stepx, stepy, srcpitch, args...); + } +} + +template +void do_interp_stretch_blend(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, tjs_int len, + const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int _blend_y, tjs_int srcstart, tjs_int srcstep, TArg... 
args) { + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, PreFragLen, src1, src2, _blend_y, srcstart, srcstep, args...); + dest += PreFragLen; + srcstart += PreFragLen * srcstep; + } + } + tjs_int blend_y = _blend_y + (_blend_y >> 7); /* adjust blend ratio */ + + uint8_t tmpbuff[4 * 8 * 3 + 16]; + tjs_uint32 *tmp1_0 = (tjs_uint32*)((((intptr_t)tmpbuff) + 15) & ~15); + tjs_uint32 *tmp1_1 = tmp1_0 + 8; + uint16_t *blend_x = (uint16_t *)__builtin_assume_aligned(tmp1_1 + 8, 8); + tjs_uint32* pVecEndDst = pEndDst - 7; + while (dest < pVecEndDst) { + tjs_int start = srcstart; + for (int i = 0; i < 8; ++i) { + int addr = start >> 16; + tmp1_0[i] = src2[addr]; + tmp1_1[i] = src2[addr + 1]; + blend_x[i] = (start & 0xffff) >> 8; + start += srcstep; + } + // TVPBlendARGB(src2[sp], src2[sp+1], blend_x) + uint8x8x4_t b = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp1_0, 8)); + uint8x8x4_t a = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp1_1, 8)); + uint16x8_t ratio = vld1q_u16(blend_x); // qreg = 5 + // TVPBlendARGB: a * ratio + b * (1 - ratio) => b + (a - b) * ratio + uint16x8_t s_a16 = vmulq_u16(vsubl_u8(a.val[3], b.val[3]), ratio); + uint16x8_t s_r16 = vmulq_u16(vsubl_u8(a.val[2], b.val[2]), ratio); + uint16x8_t s_g16 = vmulq_u16(vsubl_u8(a.val[1], b.val[1]), ratio); + uint16x8_t s_b16 = vmulq_u16(vsubl_u8(a.val[0], b.val[0]), ratio); // qreg = 9 + + uint8x8x4_t s_argb8; + s_argb8.val[3] = vadd_u8(b.val[3], vshrn_n_u16(s_a16, 8)); + s_argb8.val[2] = vadd_u8(b.val[2], vshrn_n_u16(s_r16, 8)); + s_argb8.val[1] = vadd_u8(b.val[1], vshrn_n_u16(s_g16, 8)); + s_argb8.val[0] = vadd_u8(b.val[0], vshrn_n_u16(s_b16, 8)); // qreg = 11 + + start = srcstart; + for (int i = 0; i < 8; ++i) { + int addr = (start) >> 16; + tmp1_0[i] = src1[addr]; + tmp1_1[i] = src1[addr + 1]; + start += srcstep; + } + // TVPBlendARGB(src1[sp], src1[sp+1], 
blend_x) + b = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp1_0, 8)); + a = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp1_1, 8)); + s_a16 = vmulq_u16(vsubl_u8(a.val[3], b.val[3]), ratio); + s_r16 = vmulq_u16(vsubl_u8(a.val[2], b.val[2]), ratio); + s_g16 = vmulq_u16(vsubl_u8(a.val[1], b.val[1]), ratio); + s_b16 = vmulq_u16(vsubl_u8(a.val[0], b.val[0]), ratio); + uint8x8x4_t s2; + s2.val[3] = vadd_u8(b.val[3], vshrn_n_u16(s_a16, 8)); + s2.val[2] = vadd_u8(b.val[2], vshrn_n_u16(s_r16, 8)); + s2.val[1] = vadd_u8(b.val[1], vshrn_n_u16(s_g16, 8)); + s2.val[0] = vadd_u8(b.val[0], vshrn_n_u16(s_b16, 8)); // qreg = 13 + s_a16 = vmulq_n_u16(vsubl_u8(s_argb8.val[3], s2.val[3]), blend_y); + s_r16 = vmulq_n_u16(vsubl_u8(s_argb8.val[2], s2.val[2]), blend_y); + s_g16 = vmulq_n_u16(vsubl_u8(s_argb8.val[1], s2.val[1]), blend_y); + s_b16 = vmulq_n_u16(vsubl_u8(s_argb8.val[0], s2.val[0]), blend_y); + s_argb8.val[3] = vadd_u8(s2.val[3], vshrn_n_u16(s_a16, 8)); + s_argb8.val[2] = vadd_u8(s2.val[2], vshrn_n_u16(s_r16, 8)); + s_argb8.val[1] = vadd_u8(s2.val[1], vshrn_n_u16(s_g16, 8)); + s_argb8.val[0] = vadd_u8(s2.val[0], vshrn_n_u16(s_b16, 8)); + uint8x8x4_t d_argb8 = vld4_u8((uint8_t *)__builtin_assume_aligned(dest, 8)); + d_argb8 = op_func(s_argb8, d_argb8, args...); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d_argb8); + srcstart = start; + dest += 8; + } + if (dest < pEndDst) { + c_func(dest, pEndDst - dest, src1, src2, _blend_y, srcstart, srcstep, args...); + } +} + +template +void do_interp_lintrans_blend(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, tjs_int len, + const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, TArg... 
args) { + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, PreFragLen, src, sx, sy, stepx, stepy, srcpitch, args...); + dest += PreFragLen; + sx += stepx * PreFragLen; + sy += stepy * PreFragLen; + } + } + uint8_t tmpbuff[4 * 8 * 4 + 2 * 8 * 2 + 16]; + tjs_uint32 *tmp0_0 = (tjs_uint32*)((((intptr_t)tmpbuff) + 15) & ~15); + tjs_uint32 *tmp0_1 = tmp0_0 + 8; + tjs_uint32 *tmp1_0 = tmp0_1 + 8; + tjs_uint32 *tmp1_1 = tmp1_0 + 8; + uint16_t *blend_x = (uint16_t *)__builtin_assume_aligned(tmp1_1 + 8, 8); + uint16_t *blend_y = (uint16_t *)__builtin_assume_aligned(blend_x + 8, 8); + tjs_uint32* pVecEndDst = pEndDst - 7; + while (dest < pVecEndDst) { + for (int i = 0; i < 8; ++i) { + const tjs_uint32 *p0, *p1; + int bld_x, bld_y; + bld_x = (sx & 0xffff) >> 8; + bld_x += bld_x >> 7; + bld_y = (sy & 0xffff) >> 8; + bld_y += bld_y >> 7; + blend_x[i] = bld_x; + blend_y[i] = bld_y; + + p0 = (const tjs_uint32*)((const tjs_uint8*)src + ((sy >> 16))*srcpitch) + (sx >> 16); + p1 = (const tjs_uint32*)((const tjs_uint8*)p0 + srcpitch); + + tmp0_0[i] = p0[0]; + tmp0_1[i] = p0[1]; + tmp1_0[i] = p1[0]; + tmp1_1[i] = p1[1]; + + sx += stepx; + sy += stepy; + } + // TVPBlendARGB(src2[sp], src2[sp+1], blend_x) + uint8x8x4_t b = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp1_0, 8)); + uint8x8x4_t a = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp1_1, 8)); + uint16x8_t ratio = vld1q_u16(blend_x); // qreg = 5 + // TVPBlendARGB: a * ratio + b * (1 - ratio) => b + (a - b) * ratio + uint16x8_t s_a16 = vmulq_u16(vsubl_u8(a.val[3], b.val[3]), ratio); + uint16x8_t s_r16 = vmulq_u16(vsubl_u8(a.val[2], b.val[2]), ratio); + uint16x8_t s_g16 = vmulq_u16(vsubl_u8(a.val[1], b.val[1]), ratio); + uint16x8_t s_b16 = vmulq_u16(vsubl_u8(a.val[0], b.val[0]), ratio); // qreg = 9 + + uint8x8x4_t s_argb8; + s_argb8.val[3] = vadd_u8(b.val[3], vshrn_n_u16(s_a16, 8)); + 
s_argb8.val[2] = vadd_u8(b.val[2], vshrn_n_u16(s_r16, 8)); + s_argb8.val[1] = vadd_u8(b.val[1], vshrn_n_u16(s_g16, 8)); + s_argb8.val[0] = vadd_u8(b.val[0], vshrn_n_u16(s_b16, 8)); // qreg = 11 + + b = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp0_0, 8)); + a = vld4_u8((uint8_t *)__builtin_assume_aligned(tmp0_1, 8)); + s_a16 = vmulq_u16(vsubl_u8(a.val[3], b.val[3]), ratio); + s_r16 = vmulq_u16(vsubl_u8(a.val[2], b.val[2]), ratio); + s_g16 = vmulq_u16(vsubl_u8(a.val[1], b.val[1]), ratio); + s_b16 = vmulq_u16(vsubl_u8(a.val[0], b.val[0]), ratio); + uint8x8x4_t s2; + s2.val[3] = vadd_u8(b.val[3], vshrn_n_u16(s_a16, 8)); + s2.val[2] = vadd_u8(b.val[2], vshrn_n_u16(s_r16, 8)); + s2.val[1] = vadd_u8(b.val[1], vshrn_n_u16(s_g16, 8)); + s2.val[0] = vadd_u8(b.val[0], vshrn_n_u16(s_b16, 8)); // qreg = 13 + + ratio = vld1q_u16(blend_y); + s_a16 = vmulq_u16(vsubl_u8(s_argb8.val[3], s2.val[3]), ratio); + s_r16 = vmulq_u16(vsubl_u8(s_argb8.val[2], s2.val[2]), ratio); + s_g16 = vmulq_u16(vsubl_u8(s_argb8.val[1], s2.val[1]), ratio); + s_b16 = vmulq_u16(vsubl_u8(s_argb8.val[0], s2.val[0]), ratio); + s_argb8.val[3] = vadd_u8(s2.val[3], vshrn_n_u16(s_a16, 8)); + s_argb8.val[2] = vadd_u8(s2.val[2], vshrn_n_u16(s_r16, 8)); + s_argb8.val[1] = vadd_u8(s2.val[1], vshrn_n_u16(s_g16, 8)); + s_argb8.val[0] = vadd_u8(s2.val[0], vshrn_n_u16(s_b16, 8)); + uint8x8x4_t d_argb8 = vld4_u8((uint8_t *)__builtin_assume_aligned(dest, 8)); + d_argb8 = op_func(s_argb8, d_argb8, args...); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d_argb8); + dest += 8; + } + if (dest < pEndDst) { + c_func(dest, pEndDst - dest, src, sx, sy, stepx, stepy, srcpitch, args...); + } +} + +template +void do_apply_pixel_(LoadFunc lfunc, StoreFunc sfunc, CFunc c_func, OPFunc op_func, tjs_uint32 *dest, tjs_int len, TArg... 
args) { + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, PreFragLen, args...); + dest += PreFragLen; + } + } + tjs_uint32* pVecEndDst = pEndDst - elmcount + 1; + while (dest < pVecEndDst) { + TElm d = lfunc((uint8_t *)__builtin_assume_aligned(dest, 8)); + d = op_func(d, args...); + sfunc((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8; + } + + if (dest < pEndDst) { + c_func(dest, pEndDst - dest, args...); + } +} +template +void do_apply_pixel(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, tjs_int len, TArg... args) { + do_apply_pixel_(__vld4_u8, __vst4_u8, c_func, op_func, dest, len, args...); +} + +template +void do_blend_2(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int len, TArg... args) { + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, src1, src2, PreFragLen, args...); + dest += PreFragLen; + src1 += PreFragLen; + src2 += PreFragLen; + } + } + + tjs_uint32* pVecEndDst = pEndDst - 7; + if (((intptr_t)src1 & 7) || ((intptr_t)src2 & 7)) { + while (dest < pVecEndDst) { + uint8x8x4_t s1 = vld4_u8((uint8_t *)__builtin_assume_aligned(src1, 4)); + uint8x8x4_t s2 = vld4_u8((uint8_t *)__builtin_assume_aligned(src2, 4)); + uint8x8x4_t d = op_func(s2, s1, args...); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8; + src1 += 8; + src2 += 8; + } + } else { + while (dest < pVecEndDst) { + uint8x8x4_t s1 = vld4_u8((uint8_t *)__builtin_assume_aligned(src1, 8)); + uint8x8x4_t s2 = vld4_u8((uint8_t *)__builtin_assume_aligned(src2, 8)); + uint8x8x4_t d = op_func(s2, s1, args...); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8; + src1 += 8; + src2 += 8; + } + } + + if 
(dest < pEndDst) { + c_func(dest, src1, src2, pEndDst - dest, args...); + } +} + +template +void do_univ_blend(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len, TArg... args) { + tjs_uint32* pEndDst = dest + len; + tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, src1, src2, rule, table, PreFragLen, args...); + dest += PreFragLen; + src1 += PreFragLen; + src2 += PreFragLen; + rule += PreFragLen; + } + tjs_uint32* pVecEndDst = pEndDst - 7; + if (((intptr_t)src1 & 7) || ((intptr_t)src2 & 7)) { + while (dest < pVecEndDst) { + uint8x8_t opa; + opa = vset_lane_u8(table[*rule++], opa, 0); + opa = vset_lane_u8(table[*rule++], opa, 1); + opa = vset_lane_u8(table[*rule++], opa, 2); + opa = vset_lane_u8(table[*rule++], opa, 3); + opa = vset_lane_u8(table[*rule++], opa, 4); + opa = vset_lane_u8(table[*rule++], opa, 5); + opa = vset_lane_u8(table[*rule++], opa, 6); + opa = vset_lane_u8(table[*rule++], opa, 7); + uint8x8x4_t s1 = vld4_u8((uint8_t *)__builtin_assume_aligned(src1, 4)); + uint8x8x4_t s2 = vld4_u8((uint8_t *)__builtin_assume_aligned(src2, 4)); + uint8x8x4_t d = op_func(s2, s1, opa); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + src1 += 8; + src2 += 8; + dest += 8; + } + } else { + while (dest < pVecEndDst) { + uint8x8_t opa; + opa = vset_lane_u8(table[*rule++], opa, 0); + opa = vset_lane_u8(table[*rule++], opa, 1); + opa = vset_lane_u8(table[*rule++], opa, 2); + opa = vset_lane_u8(table[*rule++], opa, 3); + opa = vset_lane_u8(table[*rule++], opa, 4); + opa = vset_lane_u8(table[*rule++], opa, 5); + opa = vset_lane_u8(table[*rule++], opa, 6); + opa = vset_lane_u8(table[*rule++], opa, 7); + uint8x8x4_t s1 = vld4_u8((uint8_t *)__builtin_assume_aligned(src1, 8)); + uint8x8x4_t s2 = vld4_u8((uint8_t *)__builtin_assume_aligned(src2, 8)); + uint8x8x4_t d = 
op_func(s2, s1, opa); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + src1 += 8; + src2 += 8; + dest += 8; + } + } + if (dest < pEndDst) { + c_func(dest, src1, src2, rule, table, pEndDst - dest); + } +} + +template +void do_univ_switch(CFunc c_func, OPFunc op_func, tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len, tjs_int src1lv, tjs_int src2lv, TArg... args) { + tjs_uint32* pEndDst = dest + len; + tjs_int PreFragLen = (tjs_uint32*)((((intptr_t)dest) + 7)&~7) - dest; + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + c_func(dest, src1, src2, rule, table, PreFragLen, src1lv, src2lv, args...); + dest += PreFragLen; + src1 += PreFragLen; + src2 += PreFragLen; + rule += PreFragLen; + } + tjs_uint32* pVecEndDst = pEndDst - 7; + if (((intptr_t)src1 & 7) || ((intptr_t)src2 & 7)) { + while (dest < pVecEndDst) { + uint8x8_t opa; tjs_int o; +#define SET_LANE(i) \ + o = *rule++; if (o >= src1lv) { o = 0; } else if (o < src2lv) { o = 255; } else { o = table[o]; } opa = vset_lane_u8(o, opa, i) + SET_LANE(0); SET_LANE(1); SET_LANE(2); SET_LANE(3); SET_LANE(4); SET_LANE(5); SET_LANE(6); SET_LANE(7); + uint8x8x4_t s1 = vld4_u8((uint8_t *)__builtin_assume_aligned(src1, 4)); + uint8x8x4_t s2 = vld4_u8((uint8_t *)__builtin_assume_aligned(src2, 4)); + uint8x8x4_t d = op_func(s2, s1, opa); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + src1 += 8; + src2 += 8; + dest += 8; + } + } else { + while (dest < pVecEndDst) { + uint8x8_t opa; tjs_int o; + SET_LANE(0); SET_LANE(1); SET_LANE(2); SET_LANE(3); SET_LANE(4); SET_LANE(5); SET_LANE(6); SET_LANE(7); +#undef SET_LANE + uint8x8x4_t s1 = vld4_u8((uint8_t *)__builtin_assume_aligned(src1, 8)); + uint8x8x4_t s2 = vld4_u8((uint8_t *)__builtin_assume_aligned(src2, 8)); + uint8x8x4_t d = op_func(s2, s1, opa); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + src1 += 8; + src2 += 8; + dest += 8; + } + } + if (dest < 
pEndDst) { + c_func(dest, src1, src2, rule, table, pEndDst - dest, src1lv, src2lv, args...); + } +} + +template +uint8x8x4_t do_SrcAlphaBranch(uint8x8x4_t s, uint8x8x4_t d, TFunc func, TArg... args) { + uint64_t a = vget_lane_u64(vreinterpret_u64_u8(s.val[3]), 0); + if (!a) { + return d; + } else if (!~a) { + return s; + } + return func(s, d, args...); +} +template +uint8x8x4_t do_SrcAddAlphaBranch(uint8x8x4_t s, uint8x8x4_t d, TFunc func, TArg... args) { + uint64_t a = vget_lane_u64(vreinterpret_u64_u8(s.val[3]), 0); + if (!a) { + return d; + } + return func(s, d, args...); +} +static uint8x8x4_t do_copy_src(uint8x8x4_t s, uint8x8x4_t d) { + return s; +} + +#ifndef Region_AlphaBlend +static uint8x8x4_t do_AlphaBlend(uint8x8x4_t s, uint8x8x4_t d) { + // d + s * a - d * a + d.val[0] = vadd_u8(d.val[0], vsubhn_u16(vmull_u8(s.val[0], s.val[3]), vmull_u8(d.val[0], s.val[3]))); + d.val[1] = vadd_u8(d.val[1], vsubhn_u16(vmull_u8(s.val[1], s.val[3]), vmull_u8(d.val[1], s.val[3]))); + d.val[2] = vadd_u8(d.val[2], vsubhn_u16(vmull_u8(s.val[2], s.val[3]), vmull_u8(d.val[2], s.val[3]))); + return d; +} +static uint8x8x4_t do_AlphaBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[3] = vshrn_n_u16(vmull_u8(s.val[3], vdup_n_u8(opa)), 8); + return do_AlphaBlend(s, d); +} +static uint8x8x4_t do_AlphaBlend_d_(uint8x8x4_t s, uint8x8x4_t d, uint16x8_t sopa) { + uint8_t tmpbuff[32 + 16]; + uint16_t *tmpsa = (uint16_t*)((((intptr_t)tmpbuff) + 15) & ~15); + vst1q_u16((uint16_t *)__builtin_assume_aligned(tmpsa, 16), sopa); + s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[0]], s.val[3], 0); + s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[1]], s.val[3], 1); + s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[2]], s.val[3], 2); + s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[3]], s.val[3], 3); + s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[4]], s.val[3], 4); + s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[5]], s.val[3], 5); + 
s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[6]], s.val[3], 6); + s.val[3] = vset_lane_u8(TVPOpacityOnOpacityTable[tmpsa[7]], s.val[3], 7); + return do_AlphaBlend(s, d); +} +static uint8x8x4_t do_AlphaBlend_d(uint8x8x4_t s, uint8x8x4_t d) { + //( 255 - (255-a)*(255-b)/ 255 ); + uint16x8_t isd_a16 = vmull_u8(vmvn_u8(s.val[3]), vmvn_u8(d.val[3])); + uint16x8_t sopa = vorrq_u16(vshll_n_u8(s.val[3], 8), vmovl_u8(d.val[3])); + d.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); + return do_AlphaBlend_d_(s, d, sopa); +} +static uint8x8x4_t do_AlphaBlend_do(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + uint8x8_t opa8 = vdup_n_u8(opa); + uint16x8_t s_a16 = vmull_u8(s.val[3], opa8); + s.val[3] = vshrn_n_u16(s_a16, 8); + return do_AlphaBlend_d(s, d); +} +static uint8x8x4_t do_AddAlphaBlend(uint8x8x4_t s, uint8x8x4_t d) { + s.val[3] = vmvn_u8(s.val[3]); + // s + d * (1 - sa) + uint16x8_t d_r16 = vmull_u8(d.val[2], s.val[3]); + uint16x8_t d_g16 = vmull_u8(d.val[1], s.val[3]); + uint16x8_t d_b16 = vmull_u8(d.val[0], s.val[3]); + + // 8-bit to do saturated add + d.val[2] = vqadd_u8(vshrn_n_u16(d_r16, 8), s.val[2]); + d.val[1] = vqadd_u8(vshrn_n_u16(d_g16, 8), s.val[1]); + d.val[0] = vqadd_u8(vshrn_n_u16(d_b16, 8), s.val[0]); + + return d; +} +static uint8x8x4_t do_ConvertAlphaToAdditiveAlpha(uint8x8x4_t s) { + s.val[2] = vshrn_n_u16(vmull_u8(s.val[2], s.val[3]), 8); + s.val[1] = vshrn_n_u16(vmull_u8(s.val[1], s.val[3]), 8); + s.val[0] = vshrn_n_u16(vmull_u8(s.val[0], s.val[3]), 8); + return s; +} +static uint8x8x4_t do_AlphaBlend_da(uint8x8x4_t s, uint8x8x4_t d) { + //Da = Sa + Da - SaDa = Sa + (1 - Sa)Da + uint16x8_t a16 = vmull_u8(vmvn_u8(s.val[3]), d.val[3]); + uint8x8_t tmp = vshrn_n_u16(a16, 8); + d.val[3] = vadd_u8(s.val[3], tmp); + return d; +} +static uint8x8x4_t do_AlphaBlend_a(uint8x8x4_t s, uint8x8x4_t d) { + d = do_AlphaBlend_da(s, d); + s = do_ConvertAlphaToAdditiveAlpha(s); + return do_AddAlphaBlend(s, d); +} +static uint8x8x4_t do_AlphaBlend_ao(uint8x8x4_t s, 
uint8x8x4_t d, tjs_int opa) { + uint8x8_t opa8 = vdup_n_u8(opa); + uint16x8_t s_a16 = vmull_u8(s.val[3], opa8); + s.val[3] = vshrn_n_u16(s_a16, 8); + return do_AlphaBlend_a(s, d); +} +static uint8x8x4_t do_ConstAlphaBlend(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[3] = vdup_n_u8(opa); + return do_AlphaBlend(s, d); +} +static uint8x8x4_t do_ConstAlphaBlend_d(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[3] = vdup_n_u8(opa); + return do_AlphaBlend_d(s, d); +} +static uint8x8x4_t do_ConstAlphaBlend_a(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[3] = vdup_n_u8(opa); + d = do_AlphaBlend_da(s, d); + return do_AddAlphaBlend(s, d); +} +static uint8x8x4_t do_ConstAlphaBlend_SD(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + d.val[3] = vadd_u8(d.val[3], vsubhn_u16(vmull_u8(s.val[3], vdup_n_u8(opa)), vmull_u8(d.val[3], vdup_n_u8(opa)))); + return do_ConstAlphaBlend(s, d, opa); +} +static uint8x8x4_t do_ConstAlphaBlend_SD_d(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + uint8x8_t opa8 = vdup_n_u8(opa); + uint16x8_t sa = vmull_u8(s.val[3], opa8); + uint8x8_t a = vadd_u8(d.val[3], vsubhn_u16(sa, vmull_u8(d.val[3], opa8))); + s.val[3] = vshrn_n_u16(sa, 8); + d.val[3] = vshrn_n_u16(vmull_u8(d.val[3], vdup_n_u8(~opa)), 8); + d = do_AlphaBlend_d(s, d); + d.val[3] = a; + return d; +} +static uint8x8x4_t do_ConstAlphaBlend_SD_a(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + d.val[3] = vadd_u8(d.val[3], vsubhn_u16(vmull_u8(s.val[3], vdup_n_u8(opa)), vmull_u8(d.val[3], vdup_n_u8(opa)))); + return do_ConstAlphaBlend(s, d, opa); +} +#endif + +static uint8x8x4_t do_AlphaColorMat(uint8x8x4_t s, tjs_uint32 color) { + uint8x8x4_t d; + d.val[0] = vdup_n_u8(color & 0xff); + d.val[1] = vdup_n_u8((color >> 8) & 0xff); + d.val[2] = vdup_n_u8((color >> 16) & 0xff); + d.val[3] = vdup_n_u8(0xff); + return do_AlphaBlend(s, d); +} +static void TVPAlphaColorMat_frag(tjs_uint32 *dest, tjs_int len, const tjs_uint32 color) { + TVPAlphaColorMat_c(dest, color, len); +} +static void 
TVPAlphaColorMat_NEON(tjs_uint32 *dest, const tjs_uint32 color, tjs_int len) { + do_apply_pixel(TVPAlphaColorMat_frag, do_AlphaColorMat, dest, len, color); +} + +#ifndef Region_AdditiveAlphaBlend + +static uint8x8x4_t do_AddAlphaBlendSrc(uint8x8x4_t s, tjs_int opa) { + uint8x8_t opa8 = vdup_n_u8(opa); + uint16x8_t s_a16 = vmull_u8(s.val[3], opa8); + uint16x8_t s_r16 = vmull_u8(s.val[2], opa8); + uint16x8_t s_g16 = vmull_u8(s.val[1], opa8); + uint16x8_t s_b16 = vmull_u8(s.val[0], opa8); + s.val[3] = vshrn_n_u16(s_a16, 8); + s.val[2] = vshrn_n_u16(s_r16, 8); + s.val[1] = vshrn_n_u16(s_g16, 8); + s.val[0] = vshrn_n_u16(s_b16, 8); + return s; +} +static uint8x8x4_t do_AddAlphaBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s = do_AddAlphaBlendSrc(s, opa); + return do_AddAlphaBlend(s, d); +} +static uint8x8x4_t do_AddAlphaBlend_a(uint8x8x4_t s, uint8x8x4_t d) { + //Da = Sa + Da - SaDa + uint16x8_t d_a16 = vmull_u8(s.val[3], d.val[3]); + uint16x8_t t = vaddl_u8(s.val[3], d.val[3]); + d_a16 = vsubq_u16(t, vshrq_n_u16(d_a16, 8)); + d.val[3] = vmovn_u16(vsubq_u16(d_a16, vshrq_n_u16(d_a16, 8))); + return do_AddAlphaBlend(s, d); +} +static uint8x8x4_t do_AddAlphaBlend_ao(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s = do_AddAlphaBlendSrc(s, opa); + return do_AddAlphaBlend_a(s, d); +} +#endif + +static uint8x8x4_t do_AlphaBlend_branch(uint8x8x4_t s, uint8x8x4_t d) { + return do_SrcAlphaBranch(s, d, do_AlphaBlend); +} +static uint8x8x4_t do_AlphaBlend_a_branch(uint8x8x4_t s, uint8x8x4_t d) { + return do_SrcAddAlphaBranch(s, d, do_AlphaBlend_a); +} +static uint8x8x4_t do_AlphaBlend_d_branch(uint8x8x4_t s, uint8x8x4_t d) { + return do_SrcAlphaBranch(s, d, do_AlphaBlend_d); +} +static void TVPAlphaBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPAlphaBlend_HDA_c, do_AlphaBlend_branch, dest, src, len); +} +static void TVPAlphaBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPAlphaBlend_HDA_o_c, 
do_AlphaBlend_o, dest, src, len, opa); +} +static void TVPAlphaBlend_a_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPAlphaBlend_a_c, do_AlphaBlend_a_branch, dest, src, len); +} +static void TVPAlphaBlend_ao_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPAlphaBlend_ao_c, do_AlphaBlend_ao, dest, src, len, opa); +} +static void TVPAlphaBlend_d_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPAlphaBlend_d_c, do_AlphaBlend_d_branch, dest, src, len); +} +static void TVPAlphaBlend_do_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPAlphaBlend_do_c, do_AlphaBlend_do, dest, src, len, opa); +} +static void TVPAdditiveAlphaBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPAdditiveAlphaBlend_HDA_c, do_AddAlphaBlend, dest, src, len); +} +static void TVPAdditiveAlphaBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPAdditiveAlphaBlend_HDA_o_c, do_AddAlphaBlend_o, dest, src, len, opa); +} +static void TVPAdditiveAlphaBlend_a_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPAdditiveAlphaBlend_a_c, do_AddAlphaBlend_a, dest, src, len); +} +static void TVPAdditiveAlphaBlend_ao_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPAdditiveAlphaBlend_ao_c, do_AddAlphaBlend_ao, dest, src, len, opa); +} +static void TVPConvertAlphaToAdditiveAlpha_NEON(tjs_uint32 *dest, tjs_int len) { + do_apply_pixel(TVPConvertAlphaToAdditiveAlpha_c, do_ConvertAlphaToAdditiveAlpha, dest, len); +} + +#ifndef Region_StretchBlend + +static void TVPStretchAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep) { + do_stretch_blend(TVPStretchAlphaBlend_HDA_c, do_AlphaBlend_branch, dest, len, src, srcstart, srcstep); +} +static void TVPStretchAlphaBlend_o_NEON(tjs_uint32 *dest, tjs_int len, const 
tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchAlphaBlend_HDA_o_c, do_AlphaBlend_o, dest, len, src, srcstart, srcstep, opa); +} +static void TVPStretchAlphaBlend_a_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep) { + do_stretch_blend(TVPStretchAlphaBlend_a_c, do_AlphaBlend_a_branch, dest, len, src, srcstart, srcstep); +} +static void TVPStretchAlphaBlend_ao_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchAlphaBlend_ao_c, do_AlphaBlend_ao, dest, len, src, srcstart, srcstep, opa); +} +static void TVPStretchAlphaBlend_d_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep) { + do_stretch_blend(TVPStretchAlphaBlend_d_c, do_AlphaBlend_d_branch, dest, len, src, srcstart, srcstep); +} +static void TVPStretchAlphaBlend_do_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchAlphaBlend_do_c, do_AlphaBlend_do, dest, len, src, srcstart, srcstep, opa); +} +static void TVPStretchAdditiveAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep) { + do_stretch_blend(TVPStretchAdditiveAlphaBlend_HDA_c, do_AddAlphaBlend, dest, len, src, srcstart, srcstep); +} +static void TVPStretchAdditiveAlphaBlend_o_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchAdditiveAlphaBlend_HDA_o_c, do_AddAlphaBlend_o, dest, len, src, srcstart, srcstep, opa); +} +static void TVPStretchAdditiveAlphaBlend_a_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep) { + do_stretch_blend(TVPStretchAdditiveAlphaBlend_a_c, do_AddAlphaBlend_a, dest, len, src, srcstart, srcstep); +} +static void 
TVPStretchAdditiveAlphaBlend_ao_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchAdditiveAlphaBlend_ao_c, do_AddAlphaBlend_ao, dest, len, src, srcstart, srcstep, opa); +} +static void TVPLinTransAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_lintrans_blend(TVPLinTransAlphaBlend_HDA_c, do_AlphaBlend_branch, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} +static void TVPLinTransAlphaBlend_o_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransAlphaBlend_HDA_o_c, do_AlphaBlend_o, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPLinTransAlphaBlend_a_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_lintrans_blend(TVPLinTransAlphaBlend_a_c, do_AlphaBlend_a_branch, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} +static void TVPLinTransAlphaBlend_ao_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransAlphaBlend_ao_c, do_AlphaBlend_ao, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPLinTransAlphaBlend_d_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_lintrans_blend(TVPLinTransAlphaBlend_d_c, do_AlphaBlend_d_branch, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} +static void TVPLinTransAlphaBlend_do_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransAlphaBlend_do_c, 
do_AlphaBlend_do, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPLinTransAdditiveAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_lintrans_blend(TVPLinTransAlphaBlend_HDA_c, do_AddAlphaBlend, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} +static void TVPLinTransAdditiveAlphaBlend_o_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransAlphaBlend_HDA_o_c, do_AddAlphaBlend_o, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPLinTransAdditiveAlphaBlend_a_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_lintrans_blend(TVPLinTransAlphaBlend_a_c, do_AddAlphaBlend_a, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} +static void TVPLinTransAdditiveAlphaBlend_ao_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransAlphaBlend_ao_c, do_AddAlphaBlend_ao, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPInterpStretchCopy_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int _blend_y, tjs_int srcstart, tjs_int srcstep) { + do_interp_stretch_blend(TVPInterpStretchCopy_c, do_copy_src, dest, len, src1, src2, _blend_y, srcstart, srcstep); +} +static void TVPInterpLinTransCopy_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_interp_lintrans_blend(TVPInterpLinTransCopy_c, do_copy_src, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} +static void TVPInterpStretchAdditiveAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src1, const 
tjs_uint32 *src2, tjs_int _blend_y, tjs_int srcstart, tjs_int srcstep) { + do_interp_stretch_blend(TVPInterpStretchAdditiveAlphaBlend_c, do_AddAlphaBlend, dest, len, src1, src2, _blend_y, srcstart, srcstep); +} +static void TVPInterpStretchAdditiveAlphaBlend_o_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int _blend_y, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_interp_stretch_blend(TVPInterpStretchAdditiveAlphaBlend_o_c, do_AddAlphaBlend_o, dest, len, src1, src2, _blend_y, srcstart, srcstep, opa); +} +static void TVPInterpLinTransAdditiveAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_interp_lintrans_blend(TVPInterpLinTransAdditiveAlphaBlend_c, do_AddAlphaBlend, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} +static void TVPInterpLinTransAdditiveAlphaBlend_o_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_interp_lintrans_blend(TVPInterpLinTransAdditiveAlphaBlend_o_c, do_AddAlphaBlend_o, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPInterpStretchConstAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int _blend_y, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_interp_stretch_blend(TVPInterpStretchConstAlphaBlend_c, do_ConstAlphaBlend_SD, dest, len, src1, src2, _blend_y, srcstart, srcstep, opa); +} +static void TVPInterpLinTransConstAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_interp_lintrans_blend(TVPInterpLinTransConstAlphaBlend_c, do_ConstAlphaBlend_SD, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} + +#endif + +static uint8x8x4_t do_CopyOpaqueImage_64(uint8x8x4_t s, uint8x8x4_t d) { + s.val[3] = 
vdup_n_u8(0xFF); + return s; +} +static uint8x16x4_t do_CopyOpaqueImage_128(uint8x16x4_t s) { + s.val[3] = vdupq_n_u8(0xFF); + return s; +} +static void TVPCopyOpaqueImage_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPCopyOpaqueImage_c, do_CopyOpaqueImage_64, dest, src, len); +} +static void TVPStretchCopyOpaqueImage_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep) { + do_stretch_blend(TVPStretchCopyOpaqueImage_c, do_CopyOpaqueImage_64, dest, len, src, srcstart, srcstep); +} +static void TVPLinTransCopyOpaqueImage_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch) { + do_lintrans_blend(TVPLinTransCopyOpaqueImage_c, do_CopyOpaqueImage_64, dest, len, src, sx, sy, stepx, stepy, srcpitch); +} + +#ifndef Region_ConstAlphaBlend +static void TVPConstAlphaBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPConstAlphaBlend_HDA_c, do_ConstAlphaBlend, dest, src, len, opa); +} +static void TVPConstAlphaBlend_d_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPConstAlphaBlend_d_c, do_ConstAlphaBlend_d, dest, src, len, opa); +} +static void TVPConstAlphaBlend_a_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPConstAlphaBlend_a_c, do_ConstAlphaBlend_a, dest, src, len, opa); +} + +static void TVPStretchConstAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchConstAlphaBlend_HDA_c, do_ConstAlphaBlend, dest, len, src, srcstart, srcstep, opa); +} +static void TVPStretchConstAlphaBlend_d_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchConstAlphaBlend_d_c, do_ConstAlphaBlend_d, dest, len, src, srcstart, srcstep, opa); +} +static void 
TVPStretchConstAlphaBlend_a_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int srcstart, tjs_int srcstep, tjs_int opa) { + do_stretch_blend(TVPStretchConstAlphaBlend_a_c, do_ConstAlphaBlend_a, dest, len, src, srcstart, srcstep, opa); +} + +static void TVPLinTransConstAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransConstAlphaBlend_HDA_c, do_ConstAlphaBlend, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPLinTransConstAlphaBlend_d_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransConstAlphaBlend_d_c, do_ConstAlphaBlend_d, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} +static void TVPLinTransConstAlphaBlend_a_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src, tjs_int sx, tjs_int sy, tjs_int stepx, tjs_int stepy, tjs_int srcpitch, tjs_int opa) { + do_lintrans_blend(TVPLinTransConstAlphaBlend_a_c, do_ConstAlphaBlend_a, dest, len, src, sx, sy, stepx, stepy, srcpitch, opa); +} + +static void TVPConstAlphaBlend_SD_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int len, tjs_int opa) { + do_blend_2(TVPConstAlphaBlend_SD_c, do_ConstAlphaBlend_SD, dest, src1, src2, len, opa); +} +static void TVPConstAlphaBlend_SD_d_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int len, tjs_int opa) { + do_blend_2(TVPConstAlphaBlend_SD_d_c, do_ConstAlphaBlend_SD_d, dest, src1, src2, len, opa); +} +static void TVPConstAlphaBlend_SD_a_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, tjs_int len, tjs_int opa) { + do_blend_2(TVPConstAlphaBlend_SD_a_c, do_ConstAlphaBlend_SD_a, dest, src1, src2, len, opa); +} +#endif + +#ifndef Region_UnivTransBlend +static uint8x8x4_t do_UnivTransBlend(uint8x8x4_t s2, 
uint8x8x4_t s1, uint8x8_t opa) { + s2.val[3] = opa; + return do_AlphaBlend(s2, s1); +} +static uint8x8x4_t do_UnivTransBlend_d(uint8x8x4_t s2, uint8x8x4_t s1, uint8x8_t opa) { + uint16x8_t s1_a16 = vmull_u8(s1.val[3], vmvn_u8(opa)); // a1*(256-opa) + uint16x8_t d_a16 = vmulq_u16(vsubl_u8(s2.val[3], s1.val[3]), vmovl_u8(opa)); + uint16x8_t o16 = vsriq_n_u16(vmull_u8(s2.val[3], opa), s1_a16, 8); // addr + s1.val[3] = vadd_u8(s1.val[3], vshrn_n_u16(d_a16, 8)); + return do_AlphaBlend_d_(s2, s1, o16); +} +static uint8x8x4_t do_UnivTransBlend_a(uint8x8x4_t s2, uint8x8x4_t s1, uint8x8_t opa) { + s1.val[3] = vadd_u8(s1.val[3], vsubhn_u16(vmull_u8(s2.val[3], opa), vmull_u8(s1.val[3], opa))); + return do_UnivTransBlend(s2, s1, opa); +} +static void TVPUnivTransBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len) { + do_univ_blend(TVPUnivTransBlend_c, do_UnivTransBlend, dest, src1, src2, rule, table, len); +} +static void TVPUnivTransBlend_d_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len) { + do_univ_blend(TVPUnivTransBlend_d_c, do_UnivTransBlend_d, dest, src1, src2, rule, table, len); +} +static void TVPUnivTransBlend_a_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len) { + do_univ_blend(TVPUnivTransBlend_a_c, do_UnivTransBlend_a, dest, src1, src2, rule, table, len); +} +static void TVPUnivTransBlend_switch_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len, tjs_int src1lv, tjs_int src2lv) { + do_univ_switch(TVPUnivTransBlend_switch_c, do_UnivTransBlend, dest, src1, src2, rule, table, len, src1lv, src2lv); +} +static void TVPUnivTransBlend_switch_d_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const 
tjs_uint32 *table, tjs_int len, tjs_int src1lv, tjs_int src2lv) { + do_univ_switch(TVPUnivTransBlend_switch_d_c, do_UnivTransBlend_d, dest, src1, src2, rule, table, len, src1lv, src2lv); +} +static void TVPUnivTransBlend_switch_a_NEON(tjs_uint32 *dest, const tjs_uint32 *src1, const tjs_uint32 *src2, const tjs_uint8 *rule, const tjs_uint32 *table, tjs_int len, tjs_int src1lv, tjs_int src2lv) { + do_univ_switch(TVPUnivTransBlend_switch_a_c, do_UnivTransBlend_a, dest, src1, src2, rule, table, len, src1lv, src2lv); +} +#endif + +#ifndef Region_ApplyColorMap + +static uint8x8x4_t do_mergeColorSrc(uint8x8_t s, tjs_uint32 color) { + uint8x8x4_t src; + src.val[2] = vdup_n_u8((color >> 16) & 0xFF); + src.val[1] = vdup_n_u8((color >> 8) & 0xFF); + src.val[0] = vdup_n_u8((color >> 0) & 0xFF); + src.val[3] = s; + return src; +} + +static uint8x8x4_t do_ApplyColorMap(uint8x8_t s, uint8x8x4_t d, tjs_uint32 color) { + uint8x8x4_t src = do_mergeColorSrc(s, color); + return do_AlphaBlend(src, d); +} +static uint8x8x4_t do_ApplyColorMap_o(uint8x8_t s, uint8x8x4_t d, tjs_uint32 color, tjs_int opa) { + uint8x8x4_t src = do_mergeColorSrc(s, color); + return do_AlphaBlend_o(src, d, opa); +} +static uint8x8x4_t do_ApplyColorMap_d(uint8x8_t s, uint8x8x4_t d, tjs_uint32 color) { + uint16x8_t s_a16 = vshll_n_u8(s, 8); + uint16x8_t isd_a16 = vmull_u8(vmvn_u8(s), vmvn_u8(d.val[3])); + uint16x8_t sopa = vorrq_u16(s_a16, vmovl_u8(d.val[3])); + uint8x8x4_t src = do_mergeColorSrc(s, color); + d.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); + return do_AlphaBlend_d_(src, d, sopa); +} +static uint8x8x4_t do_ApplyColorMap_a(uint8x8_t s, uint8x8x4_t d, tjs_uint32 color) { + uint8x8x4_t src = do_mergeColorSrc(s, color); + return do_AlphaBlend_a(src, d); +} +static uint8x8x4_t do_ApplyColorMap_do(uint8x8_t s, uint8x8x4_t d, tjs_uint32 color, tjs_int opa) { + uint16x8_t s_a16 = vmull_u8(s, vdup_n_u8(opa)); + s = vshrn_n_u16(s_a16, 8); + return do_ApplyColorMap_d(s, d, color); +} +static uint8x8x4_t 
do_ApplyColorMap_ao(uint8x8_t s, uint8x8x4_t d, tjs_uint32 color, tjs_int opa) { + uint16x8_t s_a16 = vmull_u8(s, vdup_n_u8(opa)); + s = vshrn_n_u16(s_a16, 8); + return do_ApplyColorMap_a(s, d, color); +} + +static void TVPApplyColorMap_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_uint32 color) { + do_blend_lum(TVPApplyColorMap_HDA_c, do_ApplyColorMap, dest, src, len, color); +} +static void TVPApplyColorMap_o_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_uint32 color, tjs_int opa) { + do_blend_lum(TVPApplyColorMap_HDA_o_c, do_ApplyColorMap_o, dest, src, len, color, opa); +} +static void TVPApplyColorMap_d_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_uint32 color) { + do_blend_lum(TVPApplyColorMap_d_c, do_ApplyColorMap_d, dest, src, len, color); +} +static void TVPApplyColorMap_a_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_uint32 color) { + do_blend_lum(TVPApplyColorMap_a_c, do_ApplyColorMap_a, dest, src, len, color); +} +static void TVPApplyColorMap_do_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_uint32 color, tjs_int opa) { + do_blend_lum(TVPApplyColorMap_do_c, do_ApplyColorMap_do, dest, src, len, color, opa); +} +static void TVPApplyColorMap_ao_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_uint32 color, tjs_int opa) { + do_blend_lum(TVPApplyColorMap_ao_c, do_ApplyColorMap_ao, dest, src, len, color, opa); +} +#endif + +#ifndef Region_ConstColorAlphaBlend +static uint8x8x4_t do_ConstColorAlphaBlend(uint8x8x4_t d, tjs_uint32 color, tjs_int opa) { + uint16x8_t s_r16 = vdupq_n_u16(((color >> 16) & 0xFF) * opa); + uint16x8_t s_g16 = vdupq_n_u16(((color >> 8) & 0xFF) * opa); + uint16x8_t s_b16 = vdupq_n_u16(((color >> 0) & 0xFF) * opa); + uint8x8_t s_ia8 = vdup_n_u8(opa ^ 0xFF); + uint16x8_t d_r16 = vmull_u8(d.val[2], s_ia8); + uint16x8_t d_g16 = vmull_u8(d.val[1], s_ia8); + uint16x8_t d_b16 = vmull_u8(d.val[0], s_ia8); + d.val[2] = vshrn_n_u16(vaddq_u16(d_r16, s_r16), 
8); + d.val[1] = vshrn_n_u16(vaddq_u16(d_g16, s_g16), 8); + d.val[0] = vshrn_n_u16(vaddq_u16(d_b16, s_b16), 8); + return d; +} +static uint8x8x4_t do_ConstColorAlphaBlend_d(uint8x8x4_t d, tjs_uint32 color, tjs_int opa) { + uint16x8_t hopa16 = vdupq_n_u16(opa << 8); + uint8x8_t s_ia8 = vdup_n_u8(opa ^ 0xFF); + uint8x8x4_t s; + s.val[2] = vdup_n_u8((color >> 16) & 0xFF); + s.val[1] = vdup_n_u8((color >> 8) & 0xFF); + s.val[0] = vdup_n_u8((color >> 0) & 0xFF); + uint16x8_t isd_a16 = vmull_u8(s_ia8, vmvn_u8(d.val[3])); + uint16x8_t s_a16 = vorrq_u16(hopa16, vmovl_u8(d.val[3])); + d.val[3] = vmvn_u8(vshrn_n_u16(isd_a16, 8)); //(255-((255-dopa)*(255-opa)>>8)) + return do_AlphaBlend_d_(s, d, s_a16); +} +static uint8x8x4_t do_ConstColorAlphaBlend_a(uint8x8x4_t d, tjs_uint32 color, tjs_int opa) { + uint8x8x4_t s; + s.val[2] = vdup_n_u8((((color >> 16) & 0xFF) * opa) >> 8); + s.val[1] = vdup_n_u8((((color >> 8) & 0xFF) * opa) >> 8); + s.val[0] = vdup_n_u8((((color >> 0) & 0xFF) * opa) >> 8); + s.val[3] = vdup_n_u8(opa); + d = do_AlphaBlend_da(s, d); + return do_AddAlphaBlend(s, d); +} +static void TVPConstColorAlphaBlend_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 color, tjs_int opa) { + do_apply_pixel(TVPConstColorAlphaBlend_c, do_ConstColorAlphaBlend, dest, len, color, opa); +} +static void TVPConstColorAlphaBlend_d_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 color, tjs_int opa) { + do_apply_pixel(TVPConstColorAlphaBlend_d_c, do_ConstColorAlphaBlend_d, dest, len, color, opa); +} +static void TVPConstColorAlphaBlend_a_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 color, tjs_int opa) { + do_apply_pixel(TVPConstColorAlphaBlend_a_c, do_ConstColorAlphaBlend_a, dest, len, color, opa); +} +#endif + +static uint8x8x4_t do_RemoveConstOpacity(uint8x8x4_t d, tjs_int opa) { + d.val[3] = vshrn_n_u16(vmull_u8(d.val[3], vdup_n_u8(opa)), 8); + return d; +} +static uint8x8x4_t do_RemoveOpacity(uint8x8_t s, uint8x8x4_t d) { + d.val[3] = vshrn_n_u16(vmull_u8(d.val[3], vmvn_u8(s)), 8); 
+ return d; +} +static uint8x8x4_t do_RemoveOpacity_o(uint8x8_t s, uint8x8x4_t d, tjs_int _strength) { + uint8x8_t strength = vdup_n_u8(_strength); + uint16x8_t s16 = vmull_u8(s, strength); // s * str(8pix) + s16 = vmull_u8(vshrn_n_u16(vmvnq_u16(s16), 8), d.val[3]); // da * (65535 - s * str) + d.val[3] = vshrn_n_u16(s16, 8); + return d; +} +static void TVPRemoveConstOpacity_NEON(tjs_uint32 *dest, tjs_int len, tjs_int strength) +{ + do_apply_pixel(TVPRemoveConstOpacity_c, do_RemoveConstOpacity, dest, len, 255 - strength); +} +static void TVPRemoveOpacity_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len) +{ + do_blend_lum(TVPRemoveOpacity_c, do_RemoveOpacity, dest, src, len); +} +static void TVPRemoveOpacity_o_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len, tjs_int _strength) +{ + do_blend_lum(TVPRemoveOpacity_o_c, do_RemoveOpacity_o, dest, src, len, _strength); +} + +#ifndef Region_AddBlend +static uint8x16x4_t do_AddBlend_HDA_128(uint8x16x4_t s, uint8x16x4_t d) { + d.val[2] = vqaddq_u8(d.val[2], s.val[2]); + d.val[1] = vqaddq_u8(d.val[1], s.val[1]); + d.val[0] = vqaddq_u8(d.val[0], s.val[0]); + return d; +} +static uint8x16x4_t do_AddBlend_NonHDA_128(uint8x16x4_t s, uint8x16x4_t d) { + d = do_AddBlend_HDA_128(s, d); + d.val[3] = vqaddq_u8(d.val[3], s.val[3]); + return d; +} +static uint8x8x4_t do_AddBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[3] = vdup_n_u8(opa); + s = do_ConvertAlphaToAdditiveAlpha(s); + d.val[2] = vqadd_u8(d.val[2], s.val[2]); + d.val[1] = vqadd_u8(d.val[1], s.val[1]); + d.val[0] = vqadd_u8(d.val[0], s.val[0]); + return d; +} +static void TVPAddBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend_128(TVPAddBlend_c, do_AddBlend_NonHDA_128, dest, src, len); +} +static void TVPAddBlend_HDA_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend_128(TVPAddBlend_HDA_c, do_AddBlend_HDA_128, dest, src, len); +} +static void TVPAddBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 
*src, tjs_int len, tjs_int opa) { + do_blend(TVPAddBlend_HDA_o_c, do_AddBlend_o, dest, src, len, opa); +} +#endif + +#ifndef Region_SubBlend +static uint8x16x4_t do_SubBlend_HDA(uint8x16x4_t s, uint8x16x4_t d) { + d.val[2] = vqsubq_u8(d.val[2], vmvnq_u8(s.val[2])); + d.val[1] = vqsubq_u8(d.val[1], vmvnq_u8(s.val[1])); + d.val[0] = vqsubq_u8(d.val[0], vmvnq_u8(s.val[0])); + return d; +} +static uint8x16x4_t do_SubBlend_NonHDA(uint8x16x4_t s, uint8x16x4_t d) { + d = do_SubBlend_HDA(s, d); + d.val[3] = vqsubq_u8(d.val[3], vmvnq_u8(s.val[3])); + return d; +} +static uint8x8x4_t do_SubBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + uint8x8_t opa8 = vdup_n_u8(opa); + s.val[2] = vmvn_u8(s.val[2]); + s.val[1] = vmvn_u8(s.val[1]); + s.val[0] = vmvn_u8(s.val[0]); + s.val[2] = vshrn_n_u16(vmull_u8(s.val[2], opa8), 8); + s.val[1] = vshrn_n_u16(vmull_u8(s.val[1], opa8), 8); + s.val[0] = vshrn_n_u16(vmull_u8(s.val[0], opa8), 8); + d.val[2] = vqsub_u8(d.val[2], s.val[2]); + d.val[1] = vqsub_u8(d.val[1], s.val[1]); + d.val[0] = vqsub_u8(d.val[0], s.val[0]); + return d; +} +static void TVPSubBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend_128(TVPSubBlend_c, do_SubBlend_NonHDA, dest, src, len); +} +static void TVPSubBlend_HDA_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend_128(TVPSubBlend_HDA_c, do_SubBlend_HDA, dest, src, len); +} +static void TVPSubBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPSubBlend_HDA_o_c, do_SubBlend_o, dest, src, len, opa); +} +#endif + +#ifndef Region_MulBlend +static uint8x8x4_t do_MulBlend_HDA(uint8x8x4_t s, uint8x8x4_t d) { + d.val[2] = vshrn_n_u16(vmull_u8(s.val[2], d.val[2]), 8); + d.val[1] = vshrn_n_u16(vmull_u8(s.val[1], d.val[1]), 8); + d.val[0] = vshrn_n_u16(vmull_u8(s.val[0], d.val[0]), 8); + return d; +} +static uint8x8x4_t do_MulBlend_o_HDA(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + uint8x8_t opa8 = vdup_n_u8(opa); + uint16x8_t 
s_r16 = vmull_u8(vmvn_u8(s.val[2]), opa8); + uint16x8_t s_g16 = vmull_u8(vmvn_u8(s.val[1]), opa8); + uint16x8_t s_b16 = vmull_u8(vmvn_u8(s.val[0]), opa8); + s.val[2] = vmvn_u8(vshrn_n_u16(s_r16, 8)); + s.val[1] = vmvn_u8(vshrn_n_u16(s_g16, 8)); + s.val[0] = vmvn_u8(vshrn_n_u16(s_b16, 8)); + return do_MulBlend_HDA(s, d); +} +static uint8x8x4_t do_MulBlend_NonHDA(uint8x8x4_t s, uint8x8x4_t d) { + d = do_MulBlend_HDA(s, d); + d.val[3] = vdup_n_u8(0); + return d; +} +static uint8x8x4_t do_MulBlend_o_NonHDA(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + d = do_MulBlend_o_HDA(s, d, opa); + d.val[3] = vdup_n_u8(0); + return d; +} +static void TVPMulBlend_HDA_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPMulBlend_HDA_c, do_MulBlend_HDA, dest, src, len); +} +static void TVPMulBlend_HDA_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPMulBlend_HDA_o_c, do_MulBlend_o_HDA, dest, src, len, opa); +} +static void TVPMulBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPMulBlend_c, do_MulBlend_NonHDA, dest, src, len); +} +static void TVPMulBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPMulBlend_o_c, do_MulBlend_o_NonHDA, dest, src, len, opa); +} +#endif +static uint8x8x4_t do_ColorDodgeBlend(uint8x8x4_t s_argb8, uint8x8x4_t d_argb8) { + uint8_t tmpbuff[16 + 8]; + uint8_t *tmpb = (uint8_t *)__builtin_assume_aligned((uint8_t*)((((intptr_t)tmpbuff) + 7) & ~7), 8); + for (int i = 0; i < 3; ++i) { + // d = d * 255 / (255 - s) + s_argb8.val[i] = vmvn_u8(s_argb8.val[i]); + uint16x8_t tmp = vsubl_u8(s_argb8.val[i], d_argb8.val[i]); + uint8x8_t mask = vshrn_n_u16(tmp, 8); // 00 or FF + vst1_u8(tmpb, s_argb8.val[i]); + tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[0]], tmp, 0); + tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[1]], tmp, 1); + tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[2]], tmp, 2); + tmp 
= vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[3]], tmp, 3); + tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[4]], tmp, 4); + tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[5]], tmp, 5); + tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[6]], tmp, 6); + tmp = vsetq_lane_u16(TVPRecipTableForOpacityOnOpacity[tmpb[7]], tmp, 7); + tmp = vmulq_u16(vmovl_u8(d_argb8.val[i]), tmp); + d_argb8.val[i] = vorr_u8(vshrn_n_u16(tmp, 8), mask); + } + return d_argb8; +} +static uint8x8x4_t do_ColorDodgeBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[3] = vdup_n_u8(opa); + s = do_ConvertAlphaToAdditiveAlpha(s); + return do_ColorDodgeBlend(s, d); +} +static void TVPColorDodgeBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPColorDodgeBlend_HDA_c, do_ColorDodgeBlend, dest, src, len); +} +static void TVPColorDodgeBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPColorDodgeBlend_HDA_o_c, do_ColorDodgeBlend_o, dest, src, len, opa); +} + +static uint8x16x4_t do_DarkenBlend(uint8x16x4_t s, uint8x16x4_t d) { + d.val[0] = vminq_u8(s.val[0], d.val[0]); + d.val[1] = vminq_u8(s.val[1], d.val[1]); + d.val[2] = vminq_u8(s.val[2], d.val[2]); + return d; +} +static uint8x8x4_t do_DarkenBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[2] = vmin_u8(s.val[2], d.val[2]); + s.val[1] = vmin_u8(s.val[1], d.val[1]); + s.val[0] = vmin_u8(s.val[0], d.val[0]); + s.val[3] = vdup_n_u8(opa); + return do_AlphaBlend(s, d); +} +static uint8x16x4_t do_LightenBlend(uint8x16x4_t s, uint8x16x4_t d) { + d.val[0] = vmaxq_u8(s.val[0], d.val[0]); + d.val[1] = vmaxq_u8(s.val[1], d.val[1]); + d.val[2] = vmaxq_u8(s.val[2], d.val[2]); + return d; +} +static uint8x8x4_t do_LightenBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[2] = vmax_u8(s.val[2], d.val[2]); + s.val[1] = vmax_u8(s.val[1], d.val[1]); + s.val[0] = vmax_u8(s.val[0], d.val[0]); + s.val[3] = vdup_n_u8(opa); + 
return do_AlphaBlend(s, d); +} +static uint8x8x4_t do_ScreenBlend(uint8x8x4_t s_argb8, uint8x8x4_t d_argb8) { + uint16x8_t d_r16 = vmull_u8(vmvn_u8(s_argb8.val[2]), vmvn_u8(d_argb8.val[2])); + uint16x8_t d_g16 = vmull_u8(vmvn_u8(s_argb8.val[1]), vmvn_u8(d_argb8.val[1])); + uint16x8_t d_b16 = vmull_u8(vmvn_u8(s_argb8.val[0]), vmvn_u8(d_argb8.val[0])); + d_argb8.val[2] = vmvn_u8(vshrn_n_u16(d_r16, 8)); + d_argb8.val[1] = vmvn_u8(vshrn_n_u16(d_g16, 8)); + d_argb8.val[0] = vmvn_u8(vshrn_n_u16(d_b16, 8)); + return d_argb8; +} +static uint8x8x4_t do_ScreenBlend_o(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + s.val[3] = vdup_n_u8(opa); + s = do_ConvertAlphaToAdditiveAlpha(s); + return do_ScreenBlend(s, d); +} +static void TVPDarkenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend_128(TVPDarkenBlend_HDA_c, do_DarkenBlend, dest, src, len); +} +static void TVPDarkenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPDarkenBlend_HDA_o_c, do_DarkenBlend_o, dest, src, len, opa); +} +static void TVPLightenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend_128(TVPLightenBlend_HDA_c, do_LightenBlend, dest, src, len); +} +static void TVPLightenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPLightenBlend_HDA_o_c, do_LightenBlend_o, dest, src, len, opa); // scalar fallback must be the Lighten kernel (was TVPDarkenBlend_HDA_o_c) +} +static void TVPScreenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPScreenBlend_HDA_c, do_ScreenBlend, dest, src, len); +} +static void TVPScreenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPScreenBlend_HDA_o_c, do_ScreenBlend_o, dest, src, len, opa); +} + +static void TVPFastLinearInterpV2_NEON(tjs_uint32 *dest, tjs_int len, const tjs_uint32 *src0, const tjs_uint32 *src1) +{ + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + 
if(PreFragLen > len) PreFragLen = len; + if(PreFragLen) { + TVPFastLinearInterpV2_c(dest, PreFragLen, src0, src1); + dest += PreFragLen; + src0 += PreFragLen; + src1 += PreFragLen; + } + } + + tjs_uint32* pVecEndDst = pEndDst-3; + if ((((intptr_t)src0) & 7) || (((intptr_t)src1) & 7)) { + while (dest < pVecEndDst) { + uint8x16_t s0 = vld1q_u8((uint8_t *)__builtin_assume_aligned(src0, 4)); + uint8x16_t s1 = vld1q_u8((uint8_t *)__builtin_assume_aligned(src1, 4)); + + vst1q_u8((uint8_t *)__builtin_assume_aligned(dest, 8), vhaddq_u8(s0, s1)); + dest += 4; + src0 += 4; + src1 += 4; + } + } else { + while (dest < pVecEndDst) { + uint8x16_t s0 = vld1q_u8((uint8_t *)__builtin_assume_aligned(src0, 8)); + uint8x16_t s1 = vld1q_u8((uint8_t *)__builtin_assume_aligned(src1, 8)); + + vst1q_u8((uint8_t *)__builtin_assume_aligned(dest, 8), vhaddq_u8(s0, s1)); + dest += 4; + src0 += 4; + src1 += 4; + } + } + + if(dest < pEndDst) { + TVPFastLinearInterpV2_c(dest, pEndDst - dest, src0, src1); + } +} +static uint8x8x4_t do_CopyMask(uint8x8x4_t s, uint8x8x4_t d) { + d.val[3] = s.val[3]; + return d; +} +static void TVPCopyMask_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPCopyMask_c, do_CopyMask, dest, src, len); +} +static uint8x8x4_t do_CopyColor(uint8x8x4_t s, uint8x8x4_t d) { + s.val[3] = d.val[3]; + return s; +} +static void TVPCopyColor_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPCopyColor_c, do_CopyColor, dest, src, len); +} +static uint8x8x4_t do_BindMaskToMain(uint8x8_t s, uint8x8x4_t d) { + d.val[3] = s; + return d; +} +static void TVPBindMaskToMain_NEON(tjs_uint32 *main, const tjs_uint8 *mask, tjs_int len) { + do_blend_lum(TVPBindMaskToMain_c, do_BindMaskToMain, main, mask, len); +} +static void TVPFillARGB_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 value) +{ + tjs_uint32* pEndDst = dest + len; + while((((intptr_t)dest)&15) && dest < pEndDst) { // scalar head only until dest is 16-byte aligned (the old mask ~15 kept every pixel on this path) + *dest++ = value; + } + + uint32x4_t v = vdupq_n_u32(value); + 
tjs_uint32* pVecEndDst = pEndDst-3; + while(dest < pVecEndDst) { + vst1q_u32((uint32_t *)__builtin_assume_aligned(dest, 16), v); + dest += 4; + } + while(dest < pEndDst) { + *dest++ = value; + } +} +static uint8x8x4_t do_FillColor(uint8x8x4_t d, tjs_uint32 color) { + d.val[0] = vdup_n_u8(color & 0xff); + d.val[1] = vdup_n_u8((color >> 8) & 0xff); + d.val[2] = vdup_n_u8((color >> 16) & 0xff); + return d; +} + +static void TVPFillColor_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 color) { + do_apply_pixel(TVPFillColor_c, do_FillColor, dest, len, color); +} +static uint8x8x4_t do_FillMask(uint8x8x4_t d, tjs_uint32 mask) { + d.val[3] = vdup_n_u8(mask); + return d; +} +static void TVPFillMask_NEON(tjs_uint32 *dest, tjs_int len, tjs_uint32 mask) { + do_apply_pixel(TVPFillMask_c, do_FillMask, dest, len, mask); +} + +static void TVPAddSubVertSum16_NEON(tjs_uint16 *dest, const tjs_uint32 *addline, const tjs_uint32 *subline, tjs_int len) +{ + tjs_uint16* pEndDst = dest + len * 4; + // dest is always aligned +// { +// tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest) / 4; +// if(PreFragLen > len) PreFragLen = len; +// if(PreFragLen) { +// TVPAddSubVertSum16_c(dest, addline, subline, PreFragLen); +// dest += PreFragLen * 4; +// addline += PreFragLen; +// subline += PreFragLen; +// } +// } + + tjs_uint16* pVecEndDst = pEndDst - (8 * 4 - 1); // one vector step touches 8 pixels = 32 uint16 of dest, so at least 32 must remain + if ((((intptr_t)addline) & 7) || (((intptr_t)subline) & 7)) { + while (dest < pVecEndDst) { + uint8x8x4_t add = vld4_u8((uint8_t *)__builtin_assume_aligned(addline, 4)); + uint8x8x4_t sub = vld4_u8((uint8_t *)__builtin_assume_aligned(subline, 4)); + uint16x8x4_t d = vld4q_u16((uint16_t *)__builtin_assume_aligned(dest, 8)); + d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); + d.val[2] = vaddq_u16(d.val[2], vsubl_u8(add.val[2], sub.val[2])); + d.val[1] = vaddq_u16(d.val[1], vsubl_u8(add.val[1], sub.val[1])); + d.val[0] = vaddq_u16(d.val[0], vsubl_u8(add.val[0], sub.val[0])); + vst4q_u16((uint16_t 
*)__builtin_assume_aligned(dest, 8), d); + dest += 8 * 4; + addline += 8; + subline += 8; + } + } else { + while (dest < pVecEndDst) { + uint8x8x4_t add = vld4_u8((uint8_t *)__builtin_assume_aligned(addline, 8)); + uint8x8x4_t sub = vld4_u8((uint8_t *)__builtin_assume_aligned(subline, 8)); + uint16x8x4_t d = vld4q_u16((uint16_t *)__builtin_assume_aligned(dest, 8)); + d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); + d.val[2] = vaddq_u16(d.val[2], vsubl_u8(add.val[2], sub.val[2])); + d.val[1] = vaddq_u16(d.val[1], vsubl_u8(add.val[1], sub.val[1])); + d.val[0] = vaddq_u16(d.val[0], vsubl_u8(add.val[0], sub.val[0])); + vst4q_u16((uint16_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8 * 4; + addline += 8; + subline += 8; + } + } + + if(dest < pEndDst) { + TVPAddSubVertSum16_c(dest, addline, subline, (pEndDst - dest) / 4); + } +} + +static void TVPAddSubVertSum16_d_NEON(tjs_uint16 *dest, const tjs_uint32 *addline, const tjs_uint32 *subline, tjs_int len) +{ + tjs_uint16* pEndDst = dest + len * 4; + // dest is always aligned +// { +// tjs_int PreFragLen = (((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest)) / 4; +// if(PreFragLen > len) PreFragLen = len; +// if(PreFragLen) { +// TVPAddSubVertSum16_d_c(dest, addline, subline, PreFragLen); +// dest += PreFragLen * 4; +// addline += PreFragLen; +// subline += PreFragLen; +// } +// } + + tjs_uint16* pVecEndDst = pEndDst - (8 * 4 - 1); // one vector step touches 8 pixels = 32 uint16 of dest, so at least 32 must remain + while (dest < pVecEndDst) { + uint8x8x4_t add = vld4_u8((uint8_t *)__builtin_assume_aligned(addline, 4)); + uint8x8x4_t sub = vld4_u8((uint8_t *)__builtin_assume_aligned(subline, 4)); + uint16x8x4_t d = vld4q_u16((uint16_t*)__builtin_assume_aligned(dest, 8)); + + uint16x8_t add_a = vaddl_u8(add.val[3], vshr_n_u8(add.val[3], 7)); + uint16x8_t sub_a = vaddl_u8(sub.val[3], vshr_n_u8(sub.val[3], 7)); + d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); + + uint16x8_t add_16 = vmulq_u16(vmovl_u8(add.val[2]), add_a); + uint16x8_t sub_16 = 
vmulq_u16(vmovl_u8(sub.val[2]), sub_a); + add_16 = vshrq_n_u16(add_16, 8); + sub_16 = vshrq_n_u16(sub_16, 8); + d.val[2] = vaddq_u16(d.val[2], vsubq_u16(add_16, sub_16)); + + add_16 = vmulq_u16(vmovl_u8(add.val[1]), add_a); + sub_16 = vmulq_u16(vmovl_u8(sub.val[1]), sub_a); + add_16 = vshrq_n_u16(add_16, 8); + sub_16 = vshrq_n_u16(sub_16, 8); + d.val[1] = vaddq_u16(d.val[1], vsubq_u16(add_16, sub_16)); + + add_16 = vmulq_u16(vmovl_u8(add.val[0]), add_a); + sub_16 = vmulq_u16(vmovl_u8(sub.val[0]), sub_a); + add_16 = vshrq_n_u16(add_16, 8); + sub_16 = vshrq_n_u16(sub_16, 8); + d.val[0] = vaddq_u16(d.val[0], vsubq_u16(add_16, sub_16)); + + vst4q_u16((uint16_t*)__builtin_assume_aligned(dest, 8), d); + dest += 8 * 4; + addline += 8; + subline += 8; + } + + if(dest < pEndDst) { + TVPAddSubVertSum16_d_c(dest, addline, subline, (pEndDst - dest) / 4); + } +} + +static void TVPDoBoxBlurAvg16_NEON(tjs_uint32 *dest, tjs_uint16 *_sum, const tjs_uint16 * add, const tjs_uint16 * sub, tjs_int n, tjs_int len) +{ + tjs_uint32* pEndDst = dest + len; + // dest is always aligned +// { +// tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); +// if(PreFragLen > len) PreFragLen = len; +// if(PreFragLen) { +// TVPDoBoxBlurAvg16_c(dest, _sum, add, sub, n, PreFragLen); +// dest += PreFragLen; +// add += PreFragLen; +// sub += PreFragLen; +// } +// } + + static const int32_t c_shl_n[4] = { 0, 8, 16, 24 }; + + uint32x4_t rcp = vdupq_n_u32((1<<16) / n); + int32x4_t shl_n = vld1q_s32(c_shl_n); + uint16x4_t half_n = vdup_n_u16(n >> 1); + + tjs_uint32* pVecEndDst = dest; // vector loop disabled: it never advances sum and stores unmerged channel lanes, so every pixel goes through TVPDoBoxBlurAvg16_c below + uint16x4_t sum = vld1_u16(_sum); + while (dest < pVecEndDst) { + uint32x4_t t = vmulq_u32(vaddl_u16(sum, half_n), rcp); + uint32x4_t d = vshlq_u32(vshrq_n_u32(t, 16), shl_n); + +// t0 = vmul_u32(vaddl_u16(src_sum.val[2], half_n)); +// vorr_u32(d, vshl_n_u32(vshr_n_u32(t1, 16), 8)); +// t1 = vmul_u32(vaddl_u16(src_sum.val[3], half_n)); +// vorr_u32(d, vshl_n_u32(vshr_n_u32(t0, 16), 8)); +// 
vorr_u32(d, vshl_n_u32(vshr_n_u32(t1, 16), 8)); +// +// uint16x4x4_t src_add = vld4_u16 + +// uint16x8_t add = vld1q_u16(add); +// uint16x8_t sub = vld1q_u16(sub); +// uint16x8_t d = vld4q_u16(dest); +// d.val[3] = vaddq_u16(d.val[3], vsubl_u8(add.val[3], sub.val[3])); +// d.val[2] = vaddq_u16(d.val[2], vsubl_u8(add.val[2], sub.val[2])); +// d.val[1] = vaddq_u16(d.val[1], vsubl_u8(add.val[1], sub.val[1])); +// d.val[0] = vaddq_u16(d.val[0], vsubl_u8(add.val[0], sub.val[0])); + vst1q_u32(dest, d); + dest += 8; + add += 8; + sub += 8; + } + + if(dest < pEndDst) { + TVPDoBoxBlurAvg16_c(dest, _sum, add, sub, n, pEndDst - dest); + } +} + +static uint8x8x4_t do_Expand8BitTo32BitGray(uint8x8_t s, uint8x8x4_t d) { + d.val[0] = s; + d.val[1] = s; + d.val[2] = s; + d.val[3] = vdup_n_u8(0xFF); + return d; +} +static void TVPExpand8BitTo32BitGray_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len) { + do_blend_lum(TVPExpand8BitTo32BitGray_c, do_Expand8BitTo32BitGray, dest, src, len); +} +static uint8x16x4_t do_ReverseRGB(uint8x16x4_t s, uint8x16x4_t d) { + uint8x16_t t = s.val[0]; + s.val[0] = s.val[2]; + s.val[2] = t; + return s; +} +static void TVPReverseRGB_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend_128(TVPReverseRGB_c, do_ReverseRGB, dest, src, len); +} + +static void TVPUpscale65_255_NEON(tjs_uint8 *dest, tjs_int len) { + // dest is already aligned by 16 bytes + tjs_uint8* pEndDst = dest + len; + tjs_uint8* pVecEndDst = pEndDst - 15; + + while (dest < pVecEndDst) { + uint8x16_t d = vld1q_u8((uint8_t *)__builtin_assume_aligned(dest, 16)); + d = vqshlq_n_u8(d, 2); + vst1q_u8((uint8_t *)__builtin_assume_aligned(dest, 16), d); + dest += 16; + } + while (dest < pEndDst) { + tjs_uint c = *dest << 2; + *dest = c > 255 ? 
255 : c; + ++dest; + } +} + +static const unsigned char rgb555_lut[4][8] = { + { 0, 0x8, 0x10, 0x18, 0x21, 0x29, 0x31, 0x39 }, + { 0x42, 0x4A, 0x52, 0x5A, 0x63, 0x6B, 0x73, 0x7B }, + { 0x84, 0x8C, 0x94, 0x9C, 0xA5, 0xAD, 0xB5, 0xBD }, + { 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF } }; + +static void TVPBLConvert15BitTo32Bit_NEON(tjs_uint32 *dest, const tjs_uint16 *src, tjs_int len) +{ + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if (PreFragLen > len) PreFragLen = len; + if (PreFragLen) { + TVPBLConvert15BitTo32Bit_c(dest, src, PreFragLen); + dest += PreFragLen; + src += PreFragLen; + } + } + + tjs_uint32* pVecEndDst = pEndDst - 7; + + if (dest < pVecEndDst) + { + uint8x8x4_t d; + d.val[3] = vdup_n_u8(0xFF); +#if 0 //def __LP64__ + uint8x16x2_t lut; + lut.val[0] = vld1q_u8(rgb555_lut[0]); + lut.val[1] = vld1q_u8(rgb555_lut[2]); + + while (dest < pVecEndDst) { + uint16x8_t s = vshlq_n_u16(vld1q_u16(src), 1); + d.val[0] = vtbl2q_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + s = vshlq_n_u16(s, 5); + d.val[1] = vtbl2q_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + s = vshlq_n_u16(s, 5); + d.val[2] = vtbl2q_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + vst4_u8((uint8_t*)dest, d); + dest += 8; + src += 8; + } +#else + uint8x8x4_t lut; + lut.val[0] = vld1_u8(rgb555_lut[0]); + lut.val[1] = vld1_u8(rgb555_lut[1]); + lut.val[2] = vld1_u8(rgb555_lut[2]); + lut.val[3] = vld1_u8(rgb555_lut[3]); + + if (((intptr_t)src) & 7) { + while (dest < pVecEndDst) { + uint16x8_t s = vshlq_n_u16(vld1q_u16((uint16_t *)__builtin_assume_aligned(src, 4)), 1); + d.val[0] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + s = vshlq_n_u16(s, 5); + d.val[1] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + s = vshlq_n_u16(s, 5); + d.val[2] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8; + src += 8; + } + } else { + while (dest < pVecEndDst) { + uint16x8_t s = 
vshlq_n_u16(vld1q_u16((uint16_t *)__builtin_assume_aligned(src, 8)), 1); + d.val[0] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + s = vshlq_n_u16(s, 5); + d.val[1] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + s = vshlq_n_u16(s, 5); + d.val[2] = vtbl4_u8(lut, vmovn_u16(vshrq_n_u16(s, 11))); + vst4_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 8; + src += 8; + } + } +#endif + } + + if (dest < pEndDst) { + TVPBLConvert15BitTo32Bit_c(dest, src, pEndDst - dest); + } +} + +static void TVPConvert24BitTo32Bit_NEON(tjs_uint32 *dest, const tjs_uint8 *src, tjs_int len) +{ + tjs_uint32* pEndDst = dest + len; + { + tjs_int PreFragLen = ((-(((tjs_int)(intptr_t)dest) & 7)) & 7) / sizeof(*dest); + if(PreFragLen > len) PreFragLen = len; + if(PreFragLen) { + TVPConvert24BitTo32Bit_c(dest, src, PreFragLen); + dest += PreFragLen; + src += PreFragLen * 3; + } + } + + tjs_uint32* pVecEndDst = pEndDst-15; + uint8x16x4_t d; + d.val[3] = vdupq_n_u8(0xFF); + if (((intptr_t)src) & 7) { + while (dest < pVecEndDst) { + uint8x16x3_t s = vld3q_u8((uint8_t *)__builtin_assume_aligned(src, 4)); + d.val[2] = s.val[0]; + d.val[1] = s.val[1]; + d.val[0] = s.val[2]; + vst4q_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 16; + src += 16 * 3; + } + } else { + while (dest < pVecEndDst) { + uint8x16x3_t s = vld3q_u8((uint8_t *)__builtin_assume_aligned(src, 8)); + d.val[2] = s.val[0]; + d.val[1] = s.val[1]; + d.val[0] = s.val[2]; + vst4q_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 16; + src += 16 * 3; + } + } + + if(dest < pEndDst) { + TVPConvert24BitTo32Bit_c(dest, src, pEndDst - dest); + } +} + +static void TVPConvert32BitTo24Bit_NEON(tjs_uint8 *dest, const tjs_uint8 *src, tjs_int len) { + const tjs_uint8* pEndSrc = src + len; + { + tjs_int PreFragLen = (-(((tjs_int)(intptr_t)src) & 7)) & 7; + if (PreFragLen > len) PreFragLen = len; + const tjs_uint8 *pend = src + PreFragLen; // in bytes + while (src < pend) + { + dest[0] = src[0]; + dest[1] = 
src[1]; + dest[2] = src[2]; + dest += 3; + src += 4; + } + } + + const tjs_uint8* pVecEndSrc = pEndSrc - (16 * 4 - 1); // one vector step reads 16 pixels = 64 source bytes (len is handled as a source byte count here) + uint8x16x3_t d; + if (((intptr_t)dest) & 7) { + while (src < pVecEndSrc) { + uint8x16x4_t s = vld4q_u8((uint8_t *)__builtin_assume_aligned(src, 8)); + d.val[0] = s.val[0]; + d.val[1] = s.val[1]; + d.val[2] = s.val[2]; + vst3q_u8(dest, d); // dest advances in 3-byte pixels, so no alignment can be assumed on this path + dest += 16 * 3; + src += 16 * 4; + } + } else { + while (src < pVecEndSrc) { + uint8x16x4_t s = vld4q_u8((uint8_t *)__builtin_assume_aligned(src, 8)); + d.val[0] = s.val[0]; + d.val[1] = s.val[1]; + d.val[2] = s.val[2]; + vst3q_u8((uint8_t *)__builtin_assume_aligned(dest, 8), d); + dest += 16 * 3; + src += 16 * 4; + } + } + + while (src < pEndSrc) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + dest += 3; + src += 4; + } +} + +static uint8x8x4_t do_GrayScale(uint8x8x4_t s) { + uint8x8_t const_19 = vdup_n_u8(19), const_183 = vdup_n_u8(183), const_54 = vdup_n_u8(54); + uint16x8_t r = vmull_u8(s.val[0], const_19); + uint16x8_t g = vmull_u8(s.val[1], const_183); + uint16x8_t b = vmull_u8(s.val[2], const_54); + r = vaddq_u16(r, g); + r = vaddq_u16(r, b); + s.val[2] = s.val[1] = s.val[0] = vshrn_n_u16(r, 8); + return s; +} +static void TVPDoGrayScale_NEON(tjs_uint32 *dest, tjs_int len) { + do_apply_pixel(TVPDoGrayScale_c, do_GrayScale, dest, len); +} +#ifndef Region_PsBlend +template +uint8x8x4_t do_PsAlphaBlend_so(uint8x8x4_t s, uint8x8x4_t d, tjs_int opa) { + uint8x8_t opa8 = vdup_n_u8(opa); + uint16x8_t a = vmull_u8(s.val[3], opa8); + s.val[3] = vshrn_n_u16(a, 8); + return op_func(s, d); +} +static void TVPPsAlphaBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsAlphaBlend_HDA_c, do_AlphaBlend, dest, src, len); +} +static void TVPPsAlphaBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsAlphaBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t 
do_PsAddBlend(uint8x8x4_t s, uint8x8x4_t d) { + s.val[2] = vqadd_u8(s.val[2], d.val[2]); + s.val[1] = vqadd_u8(s.val[1], d.val[1]); + s.val[0] = vqadd_u8(s.val[0], d.val[0]); + return do_AlphaBlend(s, d); +} +static void TVPPsAddBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsAddBlend_HDA_c, do_PsAddBlend, dest, src, len); +} +static void TVPPsAddBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsAddBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsSubBlend(uint8x8x4_t s, uint8x8x4_t d) { + s.val[2] = vqsub_u8(d.val[2], vmvn_u8(s.val[2])); + s.val[1] = vqsub_u8(d.val[1], vmvn_u8(s.val[1])); + s.val[0] = vqsub_u8(d.val[0], vmvn_u8(s.val[0])); + return do_AlphaBlend(s, d); +} +static void TVPPsSubBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsSubBlend_HDA_c, do_PsSubBlend, dest, src, len); +} +static void TVPPsSubBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsSubBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsMulBlend(uint8x8x4_t s, uint8x8x4_t d) { + s.val[2] = vshrn_n_u16(vmull_u8(s.val[2], d.val[2]), 8); + s.val[1] = vshrn_n_u16(vmull_u8(s.val[1], d.val[1]), 8); + s.val[0] = vshrn_n_u16(vmull_u8(s.val[0], d.val[0]), 8); + return do_AlphaBlend(s, d); +} +static void TVPPsMulBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsMulBlend_HDA_c, do_PsMulBlend, dest, src, len); +} +static void TVPPsMulBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsMulBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsScreenBlend(uint8x8x4_t s, uint8x8x4_t d) { + uint16x8_t d_r16 = vmull_u8(s.val[2], d.val[2]); + uint16x8_t d_g16 = vmull_u8(s.val[1], d.val[1]); + uint16x8_t d_b16 = vmull_u8(s.val[0], d.val[0]); + d_r16 = 
vsubl_u8(s.val[2], vshrn_n_u16(d_r16, 8)); + d_g16 = vsubl_u8(s.val[1], vshrn_n_u16(d_g16, 8)); + d_b16 = vsubl_u8(s.val[0], vshrn_n_u16(d_b16, 8)); + d_r16 = vmulq_u16(d_r16, vmovl_u8(s.val[3])); + d_g16 = vmulq_u16(d_g16, vmovl_u8(s.val[3])); + d_b16 = vmulq_u16(d_b16, vmovl_u8(s.val[3])); + d.val[2] = vadd_u8(d.val[2], vshrn_n_u16(d_r16, 8)); + d.val[1] = vadd_u8(d.val[1], vshrn_n_u16(d_g16, 8)); + d.val[0] = vadd_u8(d.val[0], vshrn_n_u16(d_b16, 8)); + return d; +} +static void TVPPsScreenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsScreenBlend_HDA_c, do_PsScreenBlend, dest, src, len); +} +static void TVPPsScreenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsScreenBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsOverlayBlend(uint8x8x4_t s, uint8x8x4_t d) { + // (s+d-s*d)*2-1, d>=128 + // s*d*2, d<128 + const uint8x8_t mask80 = vdup_n_u8(0x80); + const uint8x8_t mask1 = vdup_n_u8(0x1); + const uint8x8_t maskFE = vdup_n_u8(0xFE); + for (int i = 0; i < 3; ++i) { + uint16x8_t sa = vmull_u8(vorr_u8(s.val[i], mask1), d.val[i]); + uint8x8_t n = vtst_u8(d.val[i], mask80); // n = d>=128 + uint8x8_t d1 = vand_u8(d.val[i], n), s1 = vand_u8(vand_u8(s.val[i], n), maskFE); + sa = vshrq_n_u16(sa, 7); + uint16x8_t t = vshll_n_u8(vadd_u8(s1, d1), 1); + t = vsubw_u8(t, n); + t = vsubq_u16(t, sa); + s.val[i] = vand_u8(vmovn_u16(t), n); + s.val[i] = vorr_u8(s.val[i], vand_u8(vmovn_u16(sa), vmvn_u8(n))); + +// uint8x8_t threshold = vtst_u8(d.val[i], mask80); // = (128<=s)?0xFF:0 +// uint16x8_t ms2 = vshlq_n_u16(vaddl_u8(s.val[i], d.val[i]), 1); // = (dst+src)*2 +// uint16x8_t ms = vshrq_n_u16(vmull_u8(s.val[i], d.val[i]), 7); // = dst*src*2/255 +// ms2 = vsubw_u8(vsubq_u16(ms2, ms), threshold); // = (d+s-d*s)*2-255 +// s.val[i] = vand_u8(vmovn_u16(ms2), threshold); +// threshold = vmvn_u8(threshold); +// threshold = vand_u8(vmovn_u16(ms), threshold); +// 
// "Hard light" colour mix for eight pixels held as byte planes, followed by
// do_AlphaBlend(s, d). Only the three colour planes val[0..2] are rewritten;
// s.val[3] is passed through to do_AlphaBlend untouched.
//
// Per lane the mix is selected by the SOURCE value:
//     s >= 128 : (s + d) * 2 - 255 - ((s|1) * d >> 7)
//     s <  128 : ((s|1) * d) >> 7
static uint8x8x4_t do_PsHardLightBlend(uint8x8x4_t s, uint8x8x4_t d) {
    // (s+d-s*d)*2-1, s>=128
    // s*d*2, s<128
    const uint8x8_t mask80 = vdup_n_u8(0x80);
    const uint8x8_t mask1 = vdup_n_u8(0x1);
    const uint8x8_t maskFE = vdup_n_u8(0xFE);
    for (int i = 0; i < 3; ++i) {
        // sa = (s | 1) * d, later shifted right by 7 (roughly 2*s*d/256)
        uint16x8_t sa = vmull_u8(vorr_u8(s.val[i], mask1), d.val[i]);
        // n = 0xFF in lanes where s has bit 7 set (s >= 128), else 0
        uint8x8_t n = vtst_u8(s.val[i], mask80);
        // keep d and (s with bit 0 cleared) only in the s >= 128 lanes
        uint8x8_t d1 = vand_u8(d.val[i], n), s1 = vand_u8(vand_u8(s.val[i], n), maskFE);
        sa = vshrq_n_u16(sa, 7);
        // t = 2 * (s1 + d1); the 8-bit add may wrap, but only the low 8 bits
        // of the final value are kept by vmovn_u16 below
        uint16x8_t t = vshll_n_u8(vadd_u8(s1, d1), 1);
        t = vsubw_u8(t, n);   // subtract 255 in the s >= 128 lanes (n is 0xFF there)
        t = vsubq_u16(t, sa);
        // select: high-half result where n is set, plain product elsewhere
        s.val[i] = vand_u8(vmovn_u16(t), n);
        s.val[i] = vorr_u8(s.val[i], vand_u8(vmovn_u16(sa), vmvn_u8(n)));

        // Earlier variant, kept for reference (not compiled):
//      uint8x8_t threshold = vtst_u8(s.val[i], mask80); // = (128<=s)?0xFF:0
//      uint16x8_t ms2 = vshlq_n_u16(vaddl_u8(s.val[i], d.val[i]), 1); // = (dst+src)*2
//      uint16x8_t ms = vshrq_n_u16(vmull_u8(s.val[i], d.val[i]), 7); // = dst*src*2/255
//      ms2 = vqsubq_u16(vsubq_u16(ms2, ms), maskFF); // = (d+s-d*s)*2-255
//      s.val[i] = vand_u8(vmovn_u16(ms2), threshold);
//      threshold = vmvn_u8(threshold);
//      threshold = vand_u8(vmovn_u16(ms), threshold);
//      s.val[i] = vorr_u8(s.val[i], threshold);
    }

    return do_AlphaBlend(s, d);
}
do_blend(TVPPsHardLightBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsLightenBlend(uint8x8x4_t s, uint8x8x4_t d) { + s.val[2] = vmax_u8(s.val[2], d.val[2]); + s.val[1] = vmax_u8(s.val[1], d.val[1]); + s.val[0] = vmax_u8(s.val[0], d.val[0]); + return do_AlphaBlend(s, d); +} +static void TVPPsLightenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsLightenBlend_HDA_c, do_PsLightenBlend, dest, src, len); +} +static void TVPPsLightenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsLightenBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsDarkenBlend(uint8x8x4_t s, uint8x8x4_t d) { + s.val[2] = vmin_u8(s.val[2], d.val[2]); + s.val[1] = vmin_u8(s.val[1], d.val[1]); + s.val[0] = vmin_u8(s.val[0], d.val[0]); + return do_AlphaBlend(s, d); +} +static void TVPPsDarkenBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsDarkenBlend_HDA_c, do_PsDarkenBlend, dest, src, len); +} +static void TVPPsDarkenBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsDarkenBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsDiff5Blend(uint8x8x4_t s, uint8x8x4_t d) { + uint16x8_t s_r16 = vmull_u8(s.val[2], s.val[3]); + uint16x8_t s_g16 = vmull_u8(s.val[1], s.val[3]); + uint16x8_t s_b16 = vmull_u8(s.val[0], s.val[3]); + d.val[2] = vabd_u8(vshrn_n_u16(s_r16, 8), d.val[2]); + d.val[1] = vabd_u8(vshrn_n_u16(s_g16, 8), d.val[1]); + d.val[0] = vabd_u8(vshrn_n_u16(s_b16, 8), d.val[0]); + return d; +} +static void TVPPsDiff5Blend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsDiff5Blend_HDA_c, do_PsDiff5Blend, dest, src, len); +} +static void TVPPsDiff5Blend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsDiff5Blend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); 
+} + +static uint8x8x4_t do_PsDiffBlend(uint8x8x4_t s, uint8x8x4_t d) { + s.val[2] = vabd_u8(s.val[2], d.val[2]); + s.val[1] = vabd_u8(s.val[1], d.val[1]); + s.val[0] = vabd_u8(s.val[0], d.val[0]); + return do_AlphaBlend(s, d); +} +static void TVPPsDiffBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsDiffBlend_HDA_c, do_PsDiffBlend, dest, src, len); +} +static void TVPPsDiffBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsDiffBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} + +static uint8x8x4_t do_PsExclusionBlend(uint8x8x4_t s, uint8x8x4_t d) { + // c = ((s+d-(s*d*2)/255)-d)*a + d = (s-(s*d*2)/255)*a + d + uint16x8_t d_r16 = vmull_u8(s.val[2], d.val[2]); + uint16x8_t d_g16 = vmull_u8(s.val[1], d.val[1]); + uint16x8_t d_b16 = vmull_u8(s.val[0], d.val[0]); + d_r16 = vsubq_u16(vmovl_u8(s.val[2]), vshrq_n_u16(d_r16, 7)); + d_g16 = vsubq_u16(vmovl_u8(s.val[1]), vshrq_n_u16(d_g16, 7)); + d_b16 = vsubq_u16(vmovl_u8(s.val[0]), vshrq_n_u16(d_b16, 7)); + d_r16 = vmulq_u16(d_r16, vmovl_u8(s.val[3])); + d_g16 = vmulq_u16(d_g16, vmovl_u8(s.val[3])); + d_b16 = vmulq_u16(d_b16, vmovl_u8(s.val[3])); + d.val[2] = vadd_u8(d.val[2], vshrn_n_u16(d_r16, 8)); + d.val[1] = vadd_u8(d.val[1], vshrn_n_u16(d_g16, 8)); + d.val[0] = vadd_u8(d.val[0], vshrn_n_u16(d_b16, 8)); + return d; +} +static void TVPPsExclusionBlend_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len) { + do_blend(TVPPsExclusionBlend_HDA_c, do_PsExclusionBlend, dest, src, len); +} +static void TVPPsExclusionBlend_o_NEON(tjs_uint32 *dest, const tjs_uint32 *src, tjs_int len, tjs_int opa) { + do_blend(TVPPsExclusionBlend_HDA_o_c, do_PsAlphaBlend_so, dest, src, len, opa); +} +#endif + +#if TVP_TLG6_W_BLOCK_SIZE != 8 +#error TVP_TLG6_W_BLOCK_SIZE must be 8 ! 
+#endif + +static uint8x8x4_t filter_insts_0_neon(uint8x8x4_t m) { return m; } +// ( 1, IB+IG, IG, IR+IG) +static uint8x8x4_t filter_insts_1_neon(uint8x8x4_t m) { + m.val[0] = vadd_u8(m.val[0], m.val[1]); + m.val[2] = vadd_u8(m.val[2], m.val[1]); + return m; +} +// ( 2, IB, IG+IB, IR+IB+IG) +static uint8x8x4_t filter_insts_2_neon(uint8x8x4_t m) { + m.val[1] = vadd_u8(m.val[1], m.val[0]); + m.val[2] = vadd_u8(m.val[2], m.val[1]); + return m; +} +// ( 3, IB+IR+IG, IG+IR, IR) +static uint8x8x4_t filter_insts_3_neon(uint8x8x4_t m) { + m.val[1] = vadd_u8(m.val[1], m.val[2]); + m.val[0] = vadd_u8(m.val[0], m.val[1]); + return m; +} +// ( 4, IB+IR, IG+IB+IR, IR+IB+IR+IG) +static uint8x8x4_t filter_insts_4_neon(uint8x8x4_t m) { + m.val[0] = vadd_u8(m.val[0], m.val[2]); + m.val[1] = vadd_u8(m.val[1], m.val[0]); + m.val[2] = vadd_u8(m.val[2], m.val[1]); + return m; +} +// ( 5, IB+IR, IG+IB+IR, IR) +static uint8x8x4_t filter_insts_5_neon(uint8x8x4_t m) { + m.val[0] = vadd_u8(m.val[0], m.val[2]); + m.val[1] = vadd_u8(m.val[1], m.val[0]); + return m; +} +// ( 6, IB+IG, IG, IR) +static uint8x8x4_t filter_insts_6_neon(uint8x8x4_t m) { + m.val[0] = vadd_u8(m.val[0], m.val[1]); + return m; +} +// ( 7, IB, IG+IB, IR) +static uint8x8x4_t filter_insts_7_neon(uint8x8x4_t m) { + m.val[1] = vadd_u8(m.val[1], m.val[0]); + return m; +} +// ( 8, IB, IG, IR+IG) +static uint8x8x4_t filter_insts_8_neon(uint8x8x4_t m) { + m.val[2] = vadd_u8(m.val[2], m.val[1]); + return m; +} +// ( 9, IB+IG+IR+IB, IG+IR+IB, IR+IB) +static uint8x8x4_t filter_insts_9_neon(uint8x8x4_t m) { + m.val[2] = vadd_u8(m.val[2], m.val[0]); + m.val[1] = vadd_u8(m.val[1], m.val[2]); + m.val[0] = vadd_u8(m.val[0], m.val[1]); + return m; +} +// (10, IB+IR, IG+IR, IR) +static uint8x8x4_t filter_insts_10_neon(uint8x8x4_t m) { + m.val[0] = vadd_u8(m.val[0], m.val[2]); + m.val[1] = vadd_u8(m.val[1], m.val[2]); + return m; +} +// (11, IB, IG+IB, IR+IB) +static uint8x8x4_t filter_insts_11_neon(uint8x8x4_t m) { + m.val[1] = 
vadd_u8(m.val[1], m.val[0]); + m.val[2] = vadd_u8(m.val[2], m.val[0]); + return m; +} +// (12, IB, IG+IR+IB, IR+IB) +static uint8x8x4_t filter_insts_12_neon(uint8x8x4_t m) { + m.val[2] = vadd_u8(m.val[2], m.val[0]); + m.val[1] = vadd_u8(m.val[1], m.val[2]); + return m; +} +// (13, IB+IG, IG+IR+IB+IG, IR+IB+IG) +static uint8x8x4_t filter_insts_13_neon(uint8x8x4_t m) { + m.val[0] = vadd_u8(m.val[0], m.val[1]); + m.val[2] = vadd_u8(m.val[2], m.val[0]); + m.val[1] = vadd_u8(m.val[1], m.val[2]); + return m; +} +// (14, IB+IG+IR, IG+IR, IR+IB+IG+IR) +static uint8x8x4_t filter_insts_14_neon(uint8x8x4_t m) { + m.val[1] = vadd_u8(m.val[1], m.val[2]); + m.val[0] = vadd_u8(m.val[0], m.val[1]); + m.val[2] = vadd_u8(m.val[2], m.val[0]); + return m; +} +// (15, IB, IG+(IB<<1), IR+(IB<<1)) +static uint8x8x4_t filter_insts_15_neon(uint8x8x4_t m) { + uint8x8_t t = vshl_n_u8(m.val[0], 1); + m.val[1] = vadd_u8(m.val[1], t); + m.val[2] = vadd_u8(m.val[2], t); + return m; +} + +static uint8x8x4_t tlg6_forward_input_neon(uint32_t* in) { + return vld4_u8((uint8_t*)__builtin_assume_aligned(in, 8)); +} + +static uint8x8x4_t tlg6_backward_input_neon(uint32_t* in) { + uint8x8x4_t ret = vld4_u8((uint8_t*)__builtin_assume_aligned(in, 8)); + ret.val[0] = vrev64_u8(ret.val[0]); + ret.val[1] = vrev64_u8(ret.val[1]); + ret.val[2] = vrev64_u8(ret.val[2]); + ret.val[3] = vrev64_u8(ret.val[3]); + return ret; +} + +static inline uint8x8x4_t do_unpack_pixel_rgba(uint8x8x4_t minput) { + // BGRA -> RGBA + minput.val[0] = veor_u8(minput.val[0], minput.val[2]); + minput.val[2] = veor_u8(minput.val[2], minput.val[0]); + minput.val[0] = veor_u8(minput.val[0], minput.val[2]); + + uint8x8x2_t m01 = vtrn_u8(minput.val[0], minput.val[1]); + uint8x8x2_t m23 = vtrn_u8(minput.val[2], minput.val[3]); + uint16x8x2_t m = vtrnq_u16( + vreinterpretq_u16_u8(vcombine_u8(m01.val[0], m01.val[1])), + vreinterpretq_u16_u8(vcombine_u8(m23.val[0], m23.val[1]))); + minput.val[0] = vreinterpret_u8_u16(vget_low_u16(m.val[0])); + 
/*
    MED predictor, evaluated independently on every byte lane.

    +---+---+
    |lt | t |        / min(l, t), if lt >= max(l, t);
    +---+---+  ret = | max(l, t), if lt <= min(l, t);
    | l |ret|        \ l + t - lt, otherwise;
    +---+---+

    a = l (left), b = t (top), c = lt (top-left), v = value added to the
    prediction. The result is v + max(a,b) + min(a,b) - clamp(c, min, max),
    which covers all three cases above. Additions and the final subtraction
    wrap modulo 256 per lane (vadd_u8 / vsub_u8).
*/
static inline uint8x8_t do_med_neon(uint8x8_t a, uint8x8_t b, const uint8x8_t& c, uint8x8_t v) {
    uint8x8_t a2 = a;       // keep the original a; a is overwritten next
    a = vmax_u8(a, b); // = max_a_b
    b = vmin_u8(b, a2); // = min_a_b
    v = vadd_u8(v, a);      // v += max
    a = vmin_u8(a, c); // = max_a_b < c ? max_a_b : c
    v = vadd_u8(v, b);      // v += min
    a = vmax_u8(a, b); // = min_a_b < a ? a : min_a_b   (a is now c clamped to [min, max])
    return vsub_u8(v, a);
}
do_med_neon(p, u, up, vshr_n_u8_64(minput.val[0], 32)); + up = u; + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + + u = vshr_n_u8_64(u, 32); + p = do_med_neon(p, u, up, vshr_n_u8_64(minput.val[1], 32)); + up = u; + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + prevline += 2; + + u = vld1_u8((uint8_t*)__builtin_assume_aligned(prevline, 8)); + p = do_med_neon(p, u, up, vshr_n_u8_64(minput.val[2], 32)); + up = u; + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + + u = vshr_n_u8_64(u, 32); + p = do_med_neon(p, u, up, vshr_n_u8_64(minput.val[3], 32)); + up = u; + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + prevline += 2; + + inp = curline[-1]; + inup = vget_lane_u32(vreinterpret_u32_u8(u), 0); +} + +static inline uint8x8_t do_avg_neon(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &v) { + return vadd_u8(vrhadd_u8(a, b), v); +} + +template< + uint8x8x4_t filter(uint8x8x4_t), + uint8x8x4_t input(uint32_t *)> +inline void do_filter_avg_neon(tjs_uint32& inp, tjs_uint32& up, tjs_uint32 *in, tjs_uint32 *prevline, tjs_uint32 *curline) { + uint8x8_t p = vreinterpret_u8_u32(vdup_n_u32(inp)); + uint8x8x4_t minput = input(in); + minput = filter(minput); + minput = do_unpack_pixel_rgba(minput); + uint8x8_t u; + + u = vld1_u8((uint8_t*)__builtin_assume_aligned(prevline, 8)); + p = do_avg_neon(p, u, minput.val[0]); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + + u = vshr_n_u8_64(u, 32); + p = do_avg_neon(p, u, minput.val[1]); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + prevline += 2; + + u = vld1_u8((uint8_t*)__builtin_assume_aligned(prevline, 8)); + p = do_avg_neon(p, u, minput.val[2]); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + + u = vshr_n_u8_64(u, 32); + p = do_avg_neon(p, u, minput.val[3]); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + prevline += 2; + + u = vld1_u8((uint8_t*)__builtin_assume_aligned(prevline, 8)); + p = do_avg_neon(p, u, vshr_n_u8_64(minput.val[0], 
32)); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + + u = vshr_n_u8_64(u, 32); + p = do_avg_neon(p, u, vshr_n_u8_64(minput.val[1], 32)); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + prevline += 2; + + u = vld1_u8((uint8_t*)__builtin_assume_aligned(prevline, 8)); + p = do_avg_neon(p, u, vshr_n_u8_64(minput.val[2], 32)); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + + u = vshr_n_u8_64(u, 32); + p = do_avg_neon(p, u, vshr_n_u8_64(minput.val[3], 32)); + *curline++ = vget_lane_u32(vreinterpret_u32_u8(p), 0); + prevline += 2; + + inp = curline[-1]; + up = vget_lane_u32(vreinterpret_u32_u8(u), 0); +} + +/* + chroma/luminosity decoding + (this does reordering, color correlation filter, MED/AVG at a time) +*/ +static void TVPTLG6DecodeLine_NEON(tjs_uint32 *prevline, tjs_uint32 *curline, tjs_int width/*, tjs_int start_block*/, tjs_int block_limit, tjs_uint8 *filtertypes, tjs_int skipblockbytes, tjs_uint32 *in, tjs_uint32 initialp, tjs_int oddskip, tjs_int dir) +{ +// std::vector tmp; tmp.resize(TVP_TLG6_W_BLOCK_SIZE * (block_limit - start_block)); tjs_uint32* _curline = curline; +// TVPTLG6DecodeLine_c(prevline, &tmp.front(), width, start_block, block_limit, filtertypes, skipblockbytes, in, initialp, oddskip, dir); + tjs_int start_block = 0; + uint32_t p, up; + + if(start_block) + { + prevline += start_block * TVP_TLG6_W_BLOCK_SIZE; + curline += start_block * TVP_TLG6_W_BLOCK_SIZE; + p = curline[-1]; + up = prevline[-1]; + } + else + { + p = up = initialp; + } + + oddskip *= TVP_TLG6_W_BLOCK_SIZE; // oddskip * 8 + if (dir & 1) { + // forward + skipblockbytes -= TVP_TLG6_W_BLOCK_SIZE; + in += skipblockbytes * start_block; + in += oddskip; + for (int i = start_block; i < block_limit; i++) { + if (i & 1) { + in += oddskip; + } else { + in -= oddskip; + } + switch (filtertypes[i]) { +#define TVP_TLG6_DO_CHROMA_DECODE_FORWARD(N) \ + case (N<<1)+0: do_filter_med_neon(p, up, in, prevline, curline); break;\ + case (N<<1)+1: 
do_filter_avg_neon(p, up, in, prevline, curline); break; + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(0); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(1); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(2); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(3); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(4); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(5); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(6); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(7); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(8); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(9); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(10); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(11); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(12); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(13); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(14); + TVP_TLG6_DO_CHROMA_DECODE_FORWARD(15); +#undef TVP_TLG6_DO_CHROMA_DECODE_FORWARD + } + prevline += 8; curline += 8; in += 8; + in += skipblockbytes; + } + } else { + // backward + skipblockbytes += TVP_TLG6_W_BLOCK_SIZE; + in += skipblockbytes * start_block; + in += oddskip; + //in += (TVP_TLG6_W_BLOCK_SIZE - 1); + in += TVP_TLG6_W_BLOCK_SIZE; + for (int i = start_block; i < block_limit; i++) { + if (i & 1) { + in += oddskip; + } else{ + in -= oddskip; + } + in -= 8; + switch (filtertypes[i]) { +#define TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(N) \ + case (N<<1)+0: do_filter_med_neon(p, up, in, prevline, curline); break;\ + case (N<<1)+1: do_filter_avg_neon(p, up, in, prevline, curline); break; + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(0); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(1); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(2); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(3); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(4); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(5); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(6); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(7); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(8); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(9); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(10); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(11); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(12); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(13); + TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(14); + 
TVP_TLG6_DO_CHROMA_DECODE_BACKWARD(15); +#undef TVP_TLG6_DO_CHROMA_DECODE_BACKWARD + } + prevline += 8; curline += 8; + in += skipblockbytes; + } + } + +// for (int i = 0; i < tmp.size(); ++i) { +// assert(tmp[i] == _curline[i]); +// } +} + +static void TVPTLG5ComposeColors3To4_NEON(tjs_uint8 *outp, const tjs_uint8 *upper, tjs_uint8 * const * buf, tjs_int width) +{ + const tjs_uint8 * p2 = buf[0]; + const tjs_uint8 * p1 = buf[1]; + const tjs_uint8 * p0 = buf[2]; + int x = 0; + uint8x8x3_t pc; + pc.val[0] = vdup_n_u8(0); + pc.val[1] = vdup_n_u8(0); + pc.val[2] = vdup_n_u8(0); + uint8x8x4_t rgba; + rgba.val[3] = vdup_n_u8(0xFF); + for(x = 0; x < width - 7; x += 8) { + uint8x8x3_t c; + c.val[1] = vld1_u8(p1 + x); + c.val[0] = vadd_u8(vld1_u8(p0 + x), c.val[1]); + c.val[2] = vadd_u8(vld1_u8(p2 + x), c.val[1]); + pc.val[0] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[0], 7)), c.val[0]); + pc.val[1] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[1], 7)), c.val[1]); + pc.val[2] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[2], 7)), c.val[2]); + for(int i = 0; i < 7; ++i) { + c.val[0] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[0]), 8)); + c.val[1] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[1]), 8)); + c.val[2] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[2]), 8)); + pc.val[0] = vadd_u8(pc.val[0], c.val[0]); + pc.val[1] = vadd_u8(pc.val[1], c.val[1]); + pc.val[2] = vadd_u8(pc.val[2], c.val[2]); + } + uint8x8x4_t up = vld4_u8((uint8_t*)__builtin_assume_aligned(upper, 8)); + rgba.val[0] = vadd_u8(pc.val[0], up.val[0]); + rgba.val[1] = vadd_u8(pc.val[1], up.val[1]); + rgba.val[2] = vadd_u8(pc.val[2], up.val[2]); + vst4_u8((uint8_t*)__builtin_assume_aligned(outp, 8), rgba); + outp += 4 * 8; + upper += 4 * 8; + } + + if(x < width) { + tjs_uint8 _pc[3]; + tjs_uint8 _c[3]; + _pc[0] = vget_lane_u8(pc.val[0], 7); + _pc[1] = vget_lane_u8(pc.val[1], 7); + _pc[2] = vget_lane_u8(pc.val[2], 7); + for(; x < width; x++) + { + _c[0] = p0[x]; + _c[1] = p1[x]; + 
_c[2] = p2[x]; + _c[0] += _c[1]; _c[2] += _c[1]; + *(tjs_uint32 *)outp = + ((((_pc[0] += _c[0]) + upper[0]) & 0xff) ) + + ((((_pc[1] += _c[1]) + upper[1]) & 0xff) << 8) + + ((((_pc[2] += _c[2]) + upper[2]) & 0xff) << 16) + + 0xff000000; + outp += 4; + upper += 4; + } + } +} + +static void TVPTLG5ComposeColors4To4_NEON(tjs_uint8 *outp, const tjs_uint8 *upper, tjs_uint8 * const * buf, tjs_int width) +{ +#ifdef TEST_ARM_NEON_CODE + TVPTLG5ComposeColors4To4_c(outp, upper, buf, width); + tjs_uint8 *orig_outp = outp; + tjs_uint8 *test_outp = outp = new tjs_uint8[width * 4]; +#endif + const tjs_uint8 * p2 = buf[0]; + const tjs_uint8 * p1 = buf[1]; + const tjs_uint8 * p0 = buf[2]; + const tjs_uint8 * p3 = buf[3]; + int x = 0; + uint8x8x4_t pc; + pc.val[0] = vdup_n_u8(0); + pc.val[1] = vdup_n_u8(0); + pc.val[2] = vdup_n_u8(0); + pc.val[3] = vdup_n_u8(0); + for(x = 0; x < width - 7; x += 8) { + uint8x8x4_t c; + c.val[1] = vld1_u8(p1 + x); + c.val[0] = vadd_u8(vld1_u8(p0 + x), c.val[1]); + c.val[2] = vadd_u8(vld1_u8(p2 + x), c.val[1]); + c.val[3] = vld1_u8(p3 + x); + pc.val[0] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[0], 7)), c.val[0]); + pc.val[1] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[1], 7)), c.val[1]); + pc.val[2] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[2], 7)), c.val[2]); + pc.val[3] = vadd_u8(vdup_n_u8(vget_lane_u8(pc.val[3], 7)), c.val[3]); + for(int i = 0; i < 7; ++i) { + c.val[0] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[0]), 8)); + c.val[1] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[1]), 8)); + c.val[2] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[2]), 8)); + c.val[3] = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(c.val[3]), 8)); + pc.val[0] = vadd_u8(pc.val[0], c.val[0]); + pc.val[1] = vadd_u8(pc.val[1], c.val[1]); + pc.val[2] = vadd_u8(pc.val[2], c.val[2]); + pc.val[3] = vadd_u8(pc.val[3], c.val[3]); + } + uint8x8x4_t up = vld4_u8((uint8_t*)__builtin_assume_aligned(upper, 8)); + uint8x8x4_t rgba; + rgba.val[0] = 
vadd_u8(pc.val[0], up.val[0]); + rgba.val[1] = vadd_u8(pc.val[1], up.val[1]); + rgba.val[2] = vadd_u8(pc.val[2], up.val[2]); + rgba.val[3] = vadd_u8(pc.val[3], up.val[3]); + vst4_u8((uint8_t*)__builtin_assume_aligned(outp, 8), rgba); + outp += 4 * 8; + upper += 4 * 8; + } + + if(x < width) { + tjs_uint8 _pc[4]; + tjs_uint8 _c[4]; + _pc[0] = vget_lane_u8(pc.val[0], 7); + _pc[1] = vget_lane_u8(pc.val[1], 7); + _pc[2] = vget_lane_u8(pc.val[2], 7); + _pc[3] = vget_lane_u8(pc.val[3], 7); + for(; x < width; x++) + { + _c[0] = p0[x]; + _c[1] = p1[x]; + _c[2] = p2[x]; + _c[3] = p3[x]; + _c[0] += _c[1]; _c[2] += _c[1]; + *(tjs_uint32 *)outp = + ((((_pc[0] += _c[0]) + upper[0]) & 0xff) ) + + ((((_pc[1] += _c[1]) + upper[1]) & 0xff) << 8) + + ((((_pc[2] += _c[2]) + upper[2]) & 0xff) << 16) + + ((((_pc[3] += _c[3]) + upper[3]) & 0xff) << 24); + outp += 4; + upper += 4; + } + } +#ifdef TEST_ARM_NEON_CODE + for (int i = 0; i < width * 4; ++i) { + assert(test_outp[i] == orig_outp[i]); + } + delete[]test_outp; +#endif +} + +static tjs_int TVPTLG5DecompressSlide_NEON(tjs_uint8 *out, const tjs_uint8 *in, tjs_int insize, tjs_uint8 *text, tjs_int initialr) { + // test +// std::vector tmp; tmp.resize(1024 * 768 * 4); +// std::vector ttext; ttext.insert(ttext.begin(), text, text + 4096 + 16); +// tjs_uint8 *pout = out; +// tjs_int rr = TVPTLG5DecompressSlide_c(&tmp[0], in, insize, &ttext[0], initialr); + + tjs_int r = initialr; + tjs_uint flags = 0; + const tjs_uint8 *inlim = in + insize; + while (in < inlim) { + if (((flags >>= 1) & 256) == 0) { + flags = in[0] | 0xff00; + in++; + if (flags == 0xff00 && r < (4096 - 8) && in < (inlim - 8)) { // copy 8byte + uint8x8_t c = vld1_u8(in); + vst1_u8(out, c);; + vst1_u8(&text[r], c);; + r += 8; + in += 8; + out += 8; + flags = 0; + continue; + } + } + if (flags & 1) { + tjs_uint16 in16 = *(tjs_uint16*)in; + tjs_uint mpos = in16 & 0xFFF; + tjs_uint mlen = (in16 >> 12) + 3; + in += 2; + if (mlen == 18) + mlen += *in++; + if (mlen > 15 && (mpos - 
r > 15 || r - mpos > 15)) { + if ((mpos + mlen) < 4096 && (r + mlen) < 4096) { + tjs_int count = mlen >> 4; + while (count--) { + uint8x16_t c = vld1q_u8(&text[mpos]); + vst1q_u8(out, c); + vst1q_u8(&text[r], c); + mpos += 16; r += 16; out += 16; + } + mlen &= 0x0f; // モ爨 + while (mlen--) { + out[0] = text[r++] = text[mpos++]; out++; + } + continue; + } +#if 0 + while (mlen) { + uint8x16_t c = vld1q_u8(&text[mpos]); + vst1q_u8(out, c); + vst1q_u8(&text[r], c); // direct write to text is OK due to the extra 16 bytes + tjs_int next = mlen < 16 ? mlen : 16; + if (mpos + next > 4095) { + next = 4096 - mpos; + mpos = 0; + } else { + mpos += next; + } + out += next; + r += next; + mlen -= next; + if (r > 4095) { + r &= 0x0fff; + vst1q_u8(&text[r - 16], c); + } + } + continue; +#endif + } + while (mlen--) { + out[0] = text[r++] = text[mpos++]; out++; + mpos &= 0x0fff; + r &= 0x0fff; + } + } else { + unsigned char c = in[0]; in++; + out[0] = c; out++; + text[r++] = c; + r &= 0x0fff; + } + } + + // test +// assert(rr == r); +// for (int i = 0; i < out - pout; ++i) { +// assert(tmp[i] == pout[i]); +// } + + return r; +} + +//#include + +static tjs_uint32 *testbuff = NULL; +static tjs_uint32 *testdata1 = NULL; +static tjs_uint32 *testdata2 = NULL; +static tjs_uint32 *testdest1 = NULL; +static tjs_uint32 *testdest2 = NULL; +static tjs_uint32 *testtable = NULL; +static tjs_uint8 *testrule = NULL; +#include +#include + +#ifdef __cplusplus +#define FUNC_API extern "C" +#else +#define FUNC_API +#endif +FUNC_API int TVPShowSimpleMessageBox(const char * text, const char * caption, unsigned int nButton, const char **btnText); // C-style +tjs_uint32 TVPGetRoughTickCount32(); + +static void ShowInMessageBox(const char *text) { + if (!text || !*text) return; + const char *btnText = "OK"; + TVPShowSimpleMessageBox(text, "Log", 1, &btnText); +} + +static void InitTestData() { + if(!testtable) { + testtable = (tjs_uint32*)malloc(256 * sizeof(tjs_uint32)); + for(int x = 0; x < 256; ++x) { + 
#if defined(TEST_ARM_NEON_CODE) || defined(LOG_NEON_TEST)
// Compares the two 256x256 result buffers testdest1 / testdest2 pixel by
// pixel and reports the first mismatch through ShowInMessageBox.
//
// A pixel is skipped when the top byte (bits 24..31) of both values is <= 1.
// Otherwise every one of the four bytes may differ by at most 2.
//
// Channels are extracted with shifts instead of a union overlay, so the
// check does not rely on the size of `unsigned long` or on union
// type-punning. On little-endian targets it selects the same bytes as the
// former r/g/b/a overlay.
static void CheckTestData(const char *pszFuncName)
{
    for (int i = 0; i < 256 * 256; ++i) {
        const tjs_uint32 px1 = testdest1[i];
        const tjs_uint32 px2 = testdest2[i];
        if ((px1 >> 24) <= 1 && (px2 >> 24) <= 1) continue;

        bool mismatch = false;
        for (int shift = 0; shift < 32 && !mismatch; shift += 8) {
            const int c1 = (int)((px1 >> shift) & 0xFF);
            const int c2 = (int)((px2 >> shift) & 0xFF);
            mismatch = abs(c1 - c2) > 2;
        }
        if (!mismatch) continue;

        char tmp[256];
        snprintf(tmp, sizeof(tmp), "test fail on function %s", pszFuncName);
#ifdef _MSC_VER
        // debugger aids: view both buffers as images
        cv::Mat test1(256, 256, CV_8UC4, testdest1, 1024);
        cv::Mat test2(256, 256, CV_8UC4, testdest2, 1024);
#endif
        ShowInMessageBox(tmp);
#if !defined(WIN32) && 0
        // disabled: dump both buffers as BMP files for offline inspection
        const char bmphdr[] = "\x42\x4D\x36\x00\x04\x00\x00\x00\x00\x00\x36\x00\x00\x00\x28\x00\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x01\x00\x20\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12\x0B\x00\x00\x12\x0B\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        FILE* f = fopen("/sdcard/testdest1.bmp", "wb");
        fwrite(bmphdr, sizeof(bmphdr), 1, f);
        fwrite(testdest1, 256 * 256, 4, f);
        fclose(f);
        f = fopen("/sdcard/testdest2.bmp", "wb");
        fwrite(bmphdr, sizeof(bmphdr), 1, f);
        fwrite(testdest2, 256 * 256, 4, f);
        fclose(f);
#endif
        return; // only the first mismatch is reported
    }
    //SDL_Log("checking %s pass", pszFuncName);
}
#endif
TVPTLG6DecodeLine_NEON((tjs_uint32 *)block_src_ref, testdest1, 64, /*0,*/ 1, &ft, 0, (tjs_uint32 *)block_src, 0, 0, 0); + TVPTLG6DecodeLineGeneric_c((tjs_uint32 *)block_src_ref, testdest2, 64, 0, 1, &ft, 0, (tjs_uint32 *)block_src, 0, 0, 0); + if (memcmp(testdest1, testdest2, 8 * 4) != 0) { + ShowInMessageBox("test fail on function TVPTLG6DecodeLineGeneric"); + assert(0); + } + } + + TVPTLG5ComposeColors3To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); + TVPTLG5ComposeColors3To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); + if (memcmp(testdest1, testdest2, 8 * 4) != 0) { + ShowInMessageBox("test fail on function TVPTLG5ComposeColors3To4"); + assert(0); + } + TVPTLG5ComposeColors4To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); + TVPTLG5ComposeColors4To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); + if (memcmp(testdest1, testdest2, 8 * 4) != 0) { + ShowInMessageBox("test fail on function TVPTLG5ComposeColors4To4"); + assert(0); + } + } +} + +#ifdef LOG_NEON_TEST +#define SHOW_AND_CLEAR_LOG ShowInMessageBox(LogData); pLogData = LogData; +#else +#define SHOW_AND_CLEAR_LOG +#endif + +#ifdef TEST_ARM_NEON_CODE + +#define REGISTER_TVPGL_BLEND_FUNC_2(origf, f) \ + InitTestData();\ + origf##_c(testdest2, testdata1, 256 * 256);\ + f = f##_NEON;\ + f##_NEON(testdest1, testdata1, 256 * 256);\ + CheckTestData(#f); +#define REGISTER_TVPGL_BLEND_FUNC(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, testdata1, 256 * 256, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, testdata1, 256 * 256, __VA_ARGS__);\ + CheckTestData(#f); +#define REGISTER_TVPGL_STRECH_FUNC_2(origf, f) \ + InitTestData();\ + origf##_c(testdest2, 16 * 256, testdata1, 0, 1 << 16);\ + f = f##_NEON;\ + f##_NEON(testdest1, 16 * 256, testdata1, 0, 1 << 16);\ + CheckTestData(#f); +#define REGISTER_TVPGL_STRECH_FUNC(origf, f, ...) 
\ + InitTestData();\ + origf##_c(testdest2, 16 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, 16 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ + CheckTestData(#f); +#define REGISTER_TVPGL_LINTRANS_FUNC_2(origf, f) \ + InitTestData();\ + origf##_c(testdest2, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64);\ + f = f##_NEON;\ + f##_NEON(testdest1, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64);\ + CheckTestData(#f); +#define REGISTER_TVPGL_LINTRANS_FUNC(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, 8 * 256, testdata1, 0, 0, 1<<16, 1<<16, 64, __VA_ARGS__);\ + CheckTestData(#f); +#define REGISTER_TVPGL_UNIVTRANS_FUNC(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, testdata1, testdata2, testrule, testtable, 256 * 256, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, testdata1, testdata2, testrule, testtable, 256 * 256, __VA_ARGS__);\ + CheckTestData_RGB(#f); +#define REGISTER_TVPGL_CUSTOM_FUNC(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, __VA_ARGS__);\ + CheckTestData(#f); +#define REGISTER_TVPGL_CUSTOM_FUNC_RGB(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, __VA_ARGS__);\ + CheckTestData_RGB(#f); +#define REGISTER_TVPGL_CUSTOM_FUNC_TYPE(origf, f, DT, ...) \ + InitTestData();\ + origf##_c((DT)testdest2, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON((DT)testdest1, __VA_ARGS__);\ + CheckTestData(#f); +#else +#ifdef LOG_NEON_TEST + +static tjs_uint32 lastTick1, lastTick2; +static tjs_int tickC, tickNEON; +static unsigned int LogDataSize = 1024; +static char *LogData, *pLogData; + +static void AddLog(const char *format, ...) 
{ + /* Appends one printf-style message followed by '\n' to the growable, NUL-terminated LogData buffer. The message is formatted into a fixed 256-byte stack buffer, so longer output is truncated. */ + char buf[256]; + va_list args; + va_start(args, format); + vsnprintf(buf, 256 - 3, format, args); + va_end(args); + if (!LogData) { + LogData = (char*)TJS_malloc(LogDataSize); + if (!LogData) return; /* allocation failed: drop the message */ + pLogData = LogData; + } + + const char *p = buf; + for (;;) { + /* Keep 3 bytes free (next char, '\n' and '\0'). Checking before the terminator test also covers an empty message, which previously could write '\0' one byte past the buffer. */ + if (LogData + LogDataSize - pLogData <= 2) { + int used = pLogData - LogData; + char *grown = (char*)TJS_realloc(LogData, LogDataSize + 1024); + if (!grown) { /* keep the old buffer and leave it terminated */ + *pLogData = '\0'; + return; + } + LogData = grown; + LogDataSize += 1024; + pLogData = LogData + used; + } + if (!*p) break; + *pLogData++ = *p++; + } + *pLogData++ = '\n'; + *pLogData = '\0'; +} +#ifdef _MSC_VER +#define TEST_COUNT 0 +#else +#define TEST_COUNT 200 +#endif +#include "tjsCommHead.h" +#include "GraphicsLoaderIntf.h" +#include "UtilStreams.h" +#include "LayerBitmapIntf.h" +#include "LayerBitmapImpl.h" +#define TVP_clNone ((tjs_uint32)(0x1fffffff)) +void TVPLoadTLG(void* formatdata, void *callbackdata, tTVPGraphicSizeCallback sizecallback, + tTVPGraphicScanLineCallback scanlinecallback, tTVPMetaInfoPushCallback metainfopushcallback, + tTJSBinaryStream *src, tjs_int keyidx, tTVPGraphicLoadMode mode); +static void logTLG6_chroma() { + if (!TEST_COUNT) return; + tjs_uint8 tmpbuff[(32 + 256) * 4 * 2 + 16]; + tjs_uint8 *block_src_ref = (tjs_uint8*)((((intptr_t)tmpbuff) + 7) & ~7); + tjs_uint8 *block_src = block_src_ref + 32 * 4; + tjs_uint32 *testdest1 = (tjs_uint32*)(block_src + 32 * 4); + tjs_uint32 *testdest2 = testdest1 + 256; + tjs_uint8 *psrc[4] = { + block_src, + block_src + 1, + block_src + 3, + block_src + 2, + }; + tickC = 0; tickNEON = 0; + for (int i = 0; i < 32; ++i) { + for (int j = 0; j < 32 * 4; ++j) { + block_src_ref[j] = 240 - i - j * 3; + block_src[j] = 16 + i + j * 3; + } + lastTick1 = TVPGetRoughTickCount32(); + for (int n = 0; n < TEST_COUNT * 4; ++n) + for (tjs_uint8 ft = 0; ft < 32; ++ft) { + TVPTLG6DecodeLineGeneric_c((tjs_uint32 *)block_src_ref, testdest2, 64, 0, 1, &ft, 0, (tjs_uint32 *)block_src, 0, 0, 0); + } + tickC += TVPGetRoughTickCount32() - lastTick1; + lastTick1 = TVPGetRoughTickCount32(); + for (int n = 0; n < 
TEST_COUNT * 4; ++n) + for (tjs_uint8 ft = 0; ft < 32; ++ft) { + /* 10-argument form, same as the call in testTLG6_chroma: the NEON routine takes only a block count, not the start_block/block_limit pair of the generic C decoder */ + TVPTLG6DecodeLine_NEON((tjs_uint32 *)block_src_ref, testdest2, 64, 1, &ft, 0, (tjs_uint32 *)block_src, 0, 0, 0); + } + tickNEON += TVPGetRoughTickCount32() - lastTick1; + } + AddLog("%s: %d ms, NEON: %d ms(%g%%)", "TVPTLG6DecodeLineGeneric", tickC, tickNEON, (float)tickNEON / tickC * 100); + + tickC = 0; tickNEON = 0; + for (int i = 0; i < 32; ++i) { + for (int j = 0; j < 32 * 4; ++j) { + block_src_ref[j] = 240 - i - j * 3; + block_src[j] = 16 + i + j * 3; + } + + lastTick1 = TVPGetRoughTickCount32(); + for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors3To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); + tickC += TVPGetRoughTickCount32() - lastTick1; + lastTick1 = TVPGetRoughTickCount32(); + for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors3To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); + tickNEON += TVPGetRoughTickCount32() - lastTick1; + } + AddLog("%s: %d ms, NEON: %d ms(%g%%)", "TVPTLG5ComposeColors3To4", tickC, tickNEON, (float)tickNEON / tickC * 100); + + tickC = 0; tickNEON = 0; + for (int i = 0; i < 32; ++i) { + for (int j = 0; j < 32 * 4; ++j) { + block_src_ref[j] = 240 - i - j * 3; + block_src[j] = 16 + i + j * 3; + } + + lastTick1 = TVPGetRoughTickCount32(); + for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors4To4_c((tjs_uint8*)testdest2, block_src_ref, psrc, 67); + tickC += TVPGetRoughTickCount32() - lastTick1; + lastTick1 = TVPGetRoughTickCount32(); + for (int n = 0; n < TEST_COUNT * 16; ++n) TVPTLG5ComposeColors4To4_NEON((tjs_uint8*)testdest1, block_src_ref, psrc, 67); + tickNEON += TVPGetRoughTickCount32() - lastTick1; + } + AddLog("%s: %d ms, NEON: %d ms(%g%%)", "TVPTLG5ComposeColors4To4", tickC, tickNEON, (float)tickNEON / tickC * 100); + + FILE *fp = fopen("/sdcard/KR2/test.tlg", "rb"); + if (fp) { + fseek(fp, 0, SEEK_END); + size_t n = ftell(fp); + fseek(fp, 0, SEEK_SET); + tTVPMemoryStream memio; memio.SetSize(n); + 
fread(memio.GetInternalBuffer(), 1, n, fp); + fclose(fp); + static tTVPBitmap *testbmp = nullptr; + TVPTLG5ComposeColors3To4 = TVPTLG5ComposeColors3To4_NEON; + TVPTLG5ComposeColors4To4 = TVPTLG5ComposeColors4To4_NEON; + TVPTLG5DecompressSlide = TVPTLG5DecompressSlide_c; + lastTick1 = TVPGetRoughTickCount32(); + for (int i = 0; i < 32; ++i) { + memio.SetPosition(0); + TVPLoadTLG(nullptr, nullptr, [](void *callbackdata, tjs_uint w, tjs_uint h, tTVPGraphicPixelFormat fmt)->int { + if (!testbmp) testbmp = new tTVPBitmap(w, h, 32); + return testbmp->GetPitch(); + }, [](void *callbackdata, tjs_int y)->void* { + return testbmp->GetScanLine(y); + }, [](void *callbackdata, const ttstr & name, const ttstr & value){}, &memio, TVP_clNone, glmNormal); + } + tickC = TVPGetRoughTickCount32() - lastTick1; + delete testbmp; testbmp = nullptr; + TVPTLG5DecompressSlide = TVPTLG5DecompressSlide_NEON; + lastTick1 = TVPGetRoughTickCount32(); + for (int i = 0; i < 32; ++i) { + memio.SetPosition(0); + TVPLoadTLG(nullptr, nullptr, [](void *callbackdata, tjs_uint w, tjs_uint h, tTVPGraphicPixelFormat fmt)->int { + if (!testbmp) testbmp = new tTVPBitmap(w, h, 32); + return testbmp->GetPitch(); + }, [](void *callbackdata, tjs_int y)->void* { + return testbmp->GetScanLine(y); + }, [](void *callbackdata, const ttstr & name, const ttstr & value){}, &memio, TVP_clNone, glmNormal); + } + tickNEON = TVPGetRoughTickCount32() - lastTick1; + delete testbmp; testbmp = nullptr; + AddLog("%s: %d ms, NEON: %d ms(%g%%)", "TVPLoadTLG5", tickC, tickNEON, (float)tickNEON / tickC * 100); + } +} + +#define REGISTER_TVPGL_BLEND_FUNC_2(origf, f) \ + InitTestData();\ + origf##_c(testdest2, testdata1, 256 * 256);\ + f = f##_NEON;\ + f##_NEON(testdest1, testdata1, 256 * 256);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for (int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, testdata1, 256 * 256); \ + lastTick2 = TVPGetRoughTickCount32();\ + for (int i = 0; 
i < TEST_COUNT; ++i) f##_NEON(testdest1, testdata1, 256 * 256); \ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } + +#define REGISTER_TVPGL_BLEND_FUNC(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, testdata1, 256 * 256, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, testdata1, 256 * 256, __VA_ARGS__);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for (int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, testdata1, 256 * 256, __VA_ARGS__); \ + lastTick2 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, testdata1, 256 * 256, __VA_ARGS__);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } + +#define REGISTER_TVPGL_STRECH_FUNC_2(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, 127 * 256, testdata1, 0, 1 << 16);\ + f = f##_NEON;\ + f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, 127 * 256, testdata1, 0, 1 << 16); \ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_STRECH_FUNC(origf, f, ...) 
\ + InitTestData();\ + origf##_c(testdest2, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16, __VA_ARGS__);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_STRECH_FUNC_0(origf, f) \ + InitTestData();\ + origf##_c(testdest2, 127 * 256, testdata1, 0, 1 << 16);\ + f = f##_NEON;\ + f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, 127 * 256, testdata1, 0, 1 << 16);\ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 1 << 16);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_LINTRANS_FUNC_2(origf, f) \ + InitTestData();\ + origf##_c(testdest2, 127 * 256, testdata1, 0, 0, 1 << 16, 0, 256); \ + f = f##_NEON;\ + f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1 << 16, 0, 256); \ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for (int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, 127 * 256, testdata1, 0, 0, 1 << 16, 0, 256); \ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) 
f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_LINTRANS_FUNC(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, 127 * 256, testdata1, 0, 0, 1<<16, 0, 256, __VA_ARGS__);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_UNIVTRANS_FUNC(origf, f, ...) 
\ + InitTestData();\ + origf##_c(testdest2, testdata1, testdata2, testrule, testtable, 256 * 256, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, testdata1, testdata2, testrule, testtable, 256 * 256, __VA_ARGS__);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, testdata1, testdata2, testrule, testtable, 256 * 256, __VA_ARGS__);\ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, testdata1, testdata2, testrule, testtable, 256 * 256, __VA_ARGS__);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_CUSTOM_FUNC(origf, f, ...) \ + InitTestData();\ + origf##_c(testdest2, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, __VA_ARGS__);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, __VA_ARGS__);\ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, __VA_ARGS__);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_CUSTOM_FUNC_RGB(origf, f, ...) 
\ + InitTestData();\ + origf##_c(testdest2, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON(testdest1, __VA_ARGS__);\ + CheckTestData_RGB(#f); if (TEST_COUNT) {\ + InitTestData();\ + lastTick1 = TVPGetRoughTickCount32();\ + for(int i = 0; i < TEST_COUNT; ++i) origf##_c(testdest2, __VA_ARGS__);\ + lastTick2 = TVPGetRoughTickCount32(); \ + for(int i = 0; i < TEST_COUNT; ++i) f##_NEON(testdest1, __VA_ARGS__);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#define REGISTER_TVPGL_CUSTOM_FUNC_TYPE(origf, f, DT, ...) \ + InitTestData();\ + origf##_c((DT)testdest2, __VA_ARGS__);\ + f = f##_NEON;\ + f##_NEON((DT)testdest1, __VA_ARGS__);\ + CheckTestData(#f); if (TEST_COUNT) {\ + InitTestData(); \ + lastTick1 = TVPGetRoughTickCount32(); \ + for (int i = 0; i < TEST_COUNT; ++i) origf##_c((DT)testdest2, __VA_ARGS__); \ + lastTick2 = TVPGetRoughTickCount32(); \ + for (int i = 0; i < TEST_COUNT; ++i) f##_NEON((DT)testdest1, __VA_ARGS__);\ + tickC = lastTick2 - lastTick1; tickNEON = TVPGetRoughTickCount32() - lastTick2; \ + AddLog("%s: %d ms, NEON: %d ms(%g%%)", #f, tickC, tickNEON, (float)tickNEON / tickC * 100); \ + f = f##_NEON; } +#else +#define REGISTER_TVPGL_BLEND_FUNC_2(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_BLEND_FUNC(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_STRECH_FUNC_2(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_STRECH_FUNC(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_LINTRANS_FUNC_2(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_LINTRANS_FUNC(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_UNIVTRANS_FUNC(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_CUSTOM_FUNC(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_CUSTOM_FUNC_RGB(origf, f, ...) f = f##_NEON; +#define REGISTER_TVPGL_CUSTOM_FUNC_TYPE(origf, f, ...) 
f = f##_NEON; +#endif +#endif +#define REGISTER_TVPGL_ONLY(origf, f) origf = f; + +#include "Protect.h" +#ifdef __cplusplus +extern "C" { +#endif +#include "tvpgl_arm_route.h" +#ifdef __cplusplus +}; +#endif + +FUNC_API void calcBezierPatch_c(float* result, /*const */float* arr/*16*/, /*const */float* a3); +FUNC_API void calcBezierPatch_NEON(float* result, float* arr/*16*/, float* p); + +FUNC_API void TVPGL_ASM_Init() +{ + if ((TVPCPUFeatures & TVP_CPU_FAMILY_MASK) == TVP_CPU_FAMILY_ARM && (TVPCPUFeatures & TVP_CPU_HAS_NEON)) + { + TVPInitTVPGL(); +#ifdef LOG_NEON_TEST +#if 0 + do { // test calcBezierPatch + float arr[32]; + float resultC[2], resultNEON[2], + pt[2] = { + ((rand() & 1) ? -1 : 1) * ((float)rand() / rand()), + ((rand() & 1) ? -1 : 1) * ((float)rand() / rand()) + }; + for (int i = 0; i < 32; ++i) { + arr[i] = ((rand() & 1) ? -1 : 1) * ((float)rand() / rand()); + } + calcBezierPatch_c(resultC, arr, pt); + calcBezierPatch_NEON(resultNEON, arr, pt); + if (resultC[0] != resultNEON[0] || resultC[1] != resultNEON[1]) { + ShowInMessageBox("test calcBezierPatch fail"); + } + if (!TEST_COUNT) break; + for (int i = 0; i < 4; ++i) { + lastTick1 = TVPGetRoughTickCount32(); + for (int i = 0; i < 160000; ++i) calcBezierPatch_c(resultC, arr, pt); + lastTick2 = TVPGetRoughTickCount32(); + for (int i = 0; i < 160000; ++i) calcBezierPatch_NEON(resultNEON, arr, pt); + AddLog("calcBezierPatch: %d ms, NEON: %d ms(%g%%)", (tickC = lastTick2 - lastTick1), (tickNEON = TVPGetRoughTickCount32() - lastTick2), (float)tickNEON / tickC * 100); + } + SHOW_AND_CLEAR_LOG; + } while (0); +#endif +#undef TEST_COUNT +#define TEST_COUNT 1000 +// REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_d, TVPStretchAlphaBlend_d); +// REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_d, TVPStretchAlphaBlend_d); +// REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_d, TVPStretchAlphaBlend_d); +// REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_d, TVPStretchAlphaBlend_d); +// 
REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_do, TVPStretchAlphaBlend_do, 100); +// REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_do, TVPStretchAlphaBlend_do, 100); +// REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_do, TVPStretchAlphaBlend_do, 100); +// REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_do, TVPStretchAlphaBlend_do, 100); +// SHOW_AND_CLEAR_LOG; +#undef TEST_COUNT +#define TEST_COUNT 200 + testTLG6_chroma(); + logTLG6_chroma(); +#endif +#if 1 + + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPAlphaBlend, TVPAlphaBlend, testdata1, 256 * 256); + REGISTER_TVPGL_ONLY(TVPAlphaBlend_HDA, TVPAlphaBlend_NEON); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPAlphaBlend_o, TVPAlphaBlend_o, testdata1, 256 * 256, 100); + REGISTER_TVPGL_ONLY(TVPAlphaBlend_HDA_o, TVPAlphaBlend_o_NEON); + REGISTER_TVPGL_CUSTOM_FUNC(TVPAlphaBlend_d, TVPAlphaBlend_d, testdata1, 256 * 256); + REGISTER_TVPGL_CUSTOM_FUNC(TVPAlphaBlend_a, TVPAlphaBlend_a, testdata1, 256 * 256); + REGISTER_TVPGL_CUSTOM_FUNC(TVPAlphaBlend_do, TVPAlphaBlend_do, testdata1, 256 * 256, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPAlphaBlend_ao, TVPAlphaBlend_ao, testdata1, 256 * 256, 100); + + REGISTER_TVPGL_CUSTOM_FUNC(TVPAlphaColorMat, TVPAlphaColorMat, 0x98765432, 256 * 256); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPAdditiveAlphaBlend, TVPAdditiveAlphaBlend, testdata1, 256 * 256); + REGISTER_TVPGL_ONLY(TVPAdditiveAlphaBlend_HDA, TVPAdditiveAlphaBlend_NEON); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPAdditiveAlphaBlend_o, TVPAdditiveAlphaBlend_o, testdata1, 256 * 256, 100); + REGISTER_TVPGL_ONLY(TVPAdditiveAlphaBlend_HDA_o, TVPAdditiveAlphaBlend_o_NEON); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPAdditiveAlphaBlend_a, TVPAdditiveAlphaBlend_a, testdata1, 256 * 256); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPAdditiveAlphaBlend_ao, TVPAdditiveAlphaBlend_ao, testdata1, 256 * 256, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPConvertAlphaToAdditiveAlpha, TVPConvertAlphaToAdditiveAlpha, 256 * 256); + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_CUSTOM_FUNC(TVPAlphaColorMat, 
TVPAlphaColorMat, 0x98765432, 256 * 256); + REGISTER_TVPGL_CUSTOM_FUNC(TVPStretchAlphaBlend_HDA, TVPStretchAlphaBlend, 16 * 256, testdata1, 0, 1 << 16); + REGISTER_TVPGL_ONLY(TVPStretchAlphaBlend_HDA, TVPStretchAlphaBlend_NEON); + REGISTER_TVPGL_CUSTOM_FUNC(TVPStretchAlphaBlend_o, TVPStretchAlphaBlend_o, 16 * 256, testdata1, 0, 1 << 16, 100); + REGISTER_TVPGL_ONLY(TVPStretchAlphaBlend_HDA_o, TVPStretchAlphaBlend_o_NEON); + REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_d, TVPStretchAlphaBlend_d); + REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAlphaBlend_a, TVPStretchAlphaBlend_a); + REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_do, TVPStretchAlphaBlend_do, 100); + REGISTER_TVPGL_STRECH_FUNC(TVPStretchAlphaBlend_ao, TVPStretchAlphaBlend_ao, 100); + + REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAdditiveAlphaBlend_HDA, TVPStretchAdditiveAlphaBlend); + REGISTER_TVPGL_ONLY(TVPStretchAdditiveAlphaBlend_HDA, TVPStretchAdditiveAlphaBlend_NEON); + REGISTER_TVPGL_STRECH_FUNC(TVPStretchAdditiveAlphaBlend_HDA_o, TVPStretchAdditiveAlphaBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPStretchAdditiveAlphaBlend_HDA_o, TVPStretchAdditiveAlphaBlend_o_NEON); + REGISTER_TVPGL_STRECH_FUNC_2(TVPStretchAdditiveAlphaBlend_a, TVPStretchAdditiveAlphaBlend_a); + REGISTER_TVPGL_STRECH_FUNC(TVPStretchAdditiveAlphaBlend_ao, TVPStretchAdditiveAlphaBlend_ao, 100); + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAlphaBlend_HDA, TVPLinTransAlphaBlend); + REGISTER_TVPGL_ONLY(TVPLinTransAlphaBlend_HDA, TVPLinTransAlphaBlend_NEON); + REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAlphaBlend_HDA_o, TVPLinTransAlphaBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPLinTransAlphaBlend_HDA_o, TVPLinTransAlphaBlend_o_NEON); + REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAlphaBlend_d, TVPLinTransAlphaBlend_d); // performance issue ! 
+ REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAlphaBlend_a, TVPLinTransAlphaBlend_a); + REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAlphaBlend_do, TVPLinTransAlphaBlend_do, 100); + REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAlphaBlend_ao, TVPLinTransAlphaBlend_ao, 100); + + REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAdditiveAlphaBlend_HDA, TVPLinTransAdditiveAlphaBlend); + REGISTER_TVPGL_ONLY(TVPLinTransAdditiveAlphaBlend_HDA, TVPLinTransAdditiveAlphaBlend_NEON); + REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAdditiveAlphaBlend_HDA_o, TVPLinTransAdditiveAlphaBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPLinTransAdditiveAlphaBlend_HDA_o, TVPLinTransAdditiveAlphaBlend_o_NEON); + REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransAdditiveAlphaBlend_a, TVPLinTransAdditiveAlphaBlend_a); + REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransAdditiveAlphaBlend_ao, TVPLinTransAdditiveAlphaBlend_ao, 100); + + SHOW_AND_CLEAR_LOG; + +// REGISTER_TVPGL_CUSTOM_FUNC(TVPInterpStretchCopy, TVPInterpStretchCopy, +// 127 * 256, testdata1, testdata2, 127, 0, 1 << 16); // performance issue ! 
+ REGISTER_TVPGL_LINTRANS_FUNC_2(TVPInterpLinTransCopy, TVPInterpLinTransCopy); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpStretchAdditiveAlphaBlend, TVPInterpStretchAdditiveAlphaBlend, + 16 * 256, testdata1, testdata2, 127, 0, 1 << 16); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpStretchAdditiveAlphaBlend_o, TVPInterpStretchAdditiveAlphaBlend_o, + 16 * 256, testdata1, testdata2, 127, 0, 1 << 16, 100); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpLinTransAdditiveAlphaBlend, TVPInterpLinTransAdditiveAlphaBlend, + 8 * 256, testdata1, 0, 0, 1 << 16, 1 << 16, 64); + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpLinTransAdditiveAlphaBlend_o, TVPInterpLinTransAdditiveAlphaBlend_o, + 8 * 256, testdata1, 0, 0, 1 << 16, 1 << 16, 64, 100); + + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPInterpStretchConstAlphaBlend, TVPInterpStretchConstAlphaBlend, + 16 * 256, testdata1, testdata2, 127, 0, 1 << 16, 100); + REGISTER_TVPGL_LINTRANS_FUNC(TVPInterpLinTransConstAlphaBlend, TVPInterpLinTransConstAlphaBlend, 100); + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyOpaqueImage, TVPCopyOpaqueImage); + REGISTER_TVPGL_CUSTOM_FUNC(TVPStretchCopyOpaqueImage, TVPStretchCopyOpaqueImage, 127 * 256, testdata1, 0, 1 << 16); + REGISTER_TVPGL_LINTRANS_FUNC_2(TVPLinTransCopyOpaqueImage, TVPLinTransCopyOpaqueImage); // performance issue ! 
+ REGISTER_TVPGL_BLEND_FUNC(TVPConstAlphaBlend_HDA, TVPConstAlphaBlend, 100); + REGISTER_TVPGL_ONLY(TVPConstAlphaBlend_HDA, TVPConstAlphaBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPConstAlphaBlend_d, TVPConstAlphaBlend_d, 100); + REGISTER_TVPGL_BLEND_FUNC(TVPConstAlphaBlend_a, TVPConstAlphaBlend_a, 100); + + REGISTER_TVPGL_STRECH_FUNC(TVPStretchConstAlphaBlend_HDA, TVPStretchConstAlphaBlend, 100); + REGISTER_TVPGL_ONLY(TVPStretchConstAlphaBlend_HDA, TVPStretchConstAlphaBlend_NEON); + REGISTER_TVPGL_STRECH_FUNC(TVPStretchConstAlphaBlend_d, TVPStretchConstAlphaBlend_d, 100); + REGISTER_TVPGL_ONLY(TVPStretchConstAlphaBlend_d, TVPStretchConstAlphaBlend_d_NEON); + REGISTER_TVPGL_STRECH_FUNC(TVPStretchConstAlphaBlend_a, TVPStretchConstAlphaBlend_a, 100); + + REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransConstAlphaBlend_HDA, TVPLinTransConstAlphaBlend, 100); // performance issue ! + REGISTER_TVPGL_ONLY(TVPLinTransConstAlphaBlend_HDA, TVPLinTransConstAlphaBlend_NEON); // performance issue ! + REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransConstAlphaBlend_d, TVPLinTransConstAlphaBlend_d, 100); // performance issue ! 
+ REGISTER_TVPGL_LINTRANS_FUNC(TVPLinTransConstAlphaBlend_a, TVPLinTransConstAlphaBlend_a, 100); + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPConstAlphaBlend_SD, TVPConstAlphaBlend_SD, testdata1, testdata2, 256 * 256, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPConstAlphaBlend_SD_a, TVPConstAlphaBlend_SD_a, testdata1, testdata2, 256 * 256, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPConstAlphaBlend_SD_d, TVPConstAlphaBlend_SD_d, testdata1, testdata2, 256 * 256, 100); + + // TVPInitUnivTransBlendTable + REGISTER_TVPGL_CUSTOM_FUNC_RGB(TVPUnivTransBlend, TVPUnivTransBlend, testdata1, testdata2, testrule, testtable, 256 * 256); + REGISTER_TVPGL_CUSTOM_FUNC(TVPUnivTransBlend_d, TVPUnivTransBlend_d, testdata1, testdata2, testrule, testtable, 256 * 256); + REGISTER_TVPGL_CUSTOM_FUNC(TVPUnivTransBlend_a, TVPUnivTransBlend_a, testdata1, testdata2, testrule, testtable, 256 * 256); + REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_switch, TVPUnivTransBlend_switch, 240, 32); + REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_switch_d, TVPUnivTransBlend_switch_d, 240, 32); + REGISTER_TVPGL_UNIVTRANS_FUNC(TVPUnivTransBlend_switch_a, TVPUnivTransBlend_switch_a, 240, 32); + + REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_HDA, TVPApplyColorMap, testrule, 256 * 256, 0x55d20688); + REGISTER_TVPGL_ONLY(TVPApplyColorMap_HDA, TVPApplyColorMap_NEON); + REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_HDA_o, TVPApplyColorMap_o, testrule, 256 * 256, 0x55d20688, 100); + REGISTER_TVPGL_ONLY(TVPApplyColorMap_HDA_o, TVPApplyColorMap_o_NEON); + REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_d, TVPApplyColorMap_d, testrule, 256 * 256, 0x55d20688); + REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_a, TVPApplyColorMap_a, testrule, 256 * 256, 0x55d20688); + REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_do, TVPApplyColorMap_do, testrule, 256 * 256, 0x55d20688, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPApplyColorMap_ao, TVPApplyColorMap_ao, testrule, 256 * 256, 0x55d20688, 100); + + SHOW_AND_CLEAR_LOG; + + 
REGISTER_TVPGL_CUSTOM_FUNC(TVPConstColorAlphaBlend, TVPConstColorAlphaBlend, 256 * 256, 0x55d20688, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPConstColorAlphaBlend_d, TVPConstColorAlphaBlend_d, 256 * 256, 0x55d20688, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPConstColorAlphaBlend_a, TVPConstColorAlphaBlend_a, 256 * 256, 0x55d20688, 100); + + REGISTER_TVPGL_CUSTOM_FUNC(TVPRemoveConstOpacity, TVPRemoveConstOpacity, 256 * 256, 100); + REGISTER_TVPGL_CUSTOM_FUNC(TVPRemoveOpacity, TVPRemoveOpacity, testrule, 255 * 256); + REGISTER_TVPGL_CUSTOM_FUNC(TVPRemoveOpacity_o, TVPRemoveOpacity_o, testrule, 255 * 256, 100); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPAddBlend, TVPAddBlend); + REGISTER_TVPGL_BLEND_FUNC_2(TVPAddBlend_HDA, TVPAddBlend_HDA); + REGISTER_TVPGL_BLEND_FUNC(TVPAddBlend_HDA_o, TVPAddBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPAddBlend_HDA_o, TVPAddBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPSubBlend, TVPSubBlend); + REGISTER_TVPGL_BLEND_FUNC_2(TVPSubBlend_HDA, TVPSubBlend_HDA); + REGISTER_TVPGL_BLEND_FUNC(TVPSubBlend_HDA_o, TVPSubBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPSubBlend_HDA_o, TVPSubBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPMulBlend_HDA, TVPMulBlend_HDA); + REGISTER_TVPGL_BLEND_FUNC_2(TVPMulBlend, TVPMulBlend); + REGISTER_TVPGL_BLEND_FUNC(TVPMulBlend_HDA_o, TVPMulBlend_HDA_o, 100); + REGISTER_TVPGL_BLEND_FUNC(TVPMulBlend_o, TVPMulBlend_o, 100); + + SHOW_AND_CLEAR_LOG; + +// REGISTER_TVPGL_BLEND_FUNC_2(TVPColorDodgeBlend_HDA, TVPColorDodgeBlend); // performance issue +// REGISTER_TVPGL_ONLY(TVPColorDodgeBlend_HDA, TVPColorDodgeBlend_NEON); +// REGISTER_TVPGL_BLEND_FUNC(TVPColorDodgeBlend_HDA_o, TVPColorDodgeBlend_o, 100); +// REGISTER_TVPGL_ONLY(TVPColorDodgeBlend_HDA_o, TVPColorDodgeBlend_o_NEON); + REGISTER_TVPGL_BLEND_FUNC_2(TVPDarkenBlend_HDA, TVPDarkenBlend); + REGISTER_TVPGL_ONLY(TVPDarkenBlend_HDA, TVPDarkenBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPDarkenBlend_HDA_o, TVPDarkenBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPDarkenBlend_HDA_o, 
TVPDarkenBlend_o_NEON); + REGISTER_TVPGL_BLEND_FUNC_2(TVPLightenBlend_HDA, TVPLightenBlend); + REGISTER_TVPGL_ONLY(TVPLightenBlend_HDA, TVPLightenBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPLightenBlend_HDA_o, TVPLightenBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPLightenBlend_HDA_o, TVPLightenBlend_o_NEON); + REGISTER_TVPGL_BLEND_FUNC_2(TVPScreenBlend_HDA, TVPScreenBlend); + REGISTER_TVPGL_ONLY(TVPScreenBlend_HDA, TVPScreenBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPScreenBlend_HDA_o, TVPScreenBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPScreenBlend_HDA_o, TVPScreenBlend_o_NEON); + + SHOW_AND_CLEAR_LOG; + +// TVPFastLinearInterpH2F, TVPFastLinearInterpH2F_c; +// TVPFastLinearInterpH2B, TVPFastLinearInterpH2B_c; + + REGISTER_TVPGL_CUSTOM_FUNC(TVPFastLinearInterpV2, TVPFastLinearInterpV2, + 256 * 256, testdata1, testdata2); + + //TVPStretchColorCopy, TVPStretchColorCopy_c; + + //TVPMakeAlphaFromKey, TVPMakeAlphaFromKey_c; + +// REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyMask, TVPCopyMask); + REGISTER_TVPGL_BLEND_FUNC_2(TVPCopyColor, TVPCopyColor); + REGISTER_TVPGL_CUSTOM_FUNC(TVPBindMaskToMain, TVPBindMaskToMain, testrule, 256 * 256); + + // NEON's TVPFillARGB is slower than plain C +// REGISTER_TVPGL_CUSTOM_FUNC(TVPFillARGB, TVPFillARGB, 256 * 256, 0x55d20688); +// REGISTER_TVPGL_ONLY(TVPFillARGB_NC, TVPFillARGB_NEON); + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_CUSTOM_FUNC(TVPFillColor, TVPFillColor, 256 * 256, 0x55d20688); + REGISTER_TVPGL_CUSTOM_FUNC(TVPFillMask, TVPFillMask, 256 * 256, 0x55d20688); + REGISTER_TVPGL_CUSTOM_FUNC_TYPE(TVPAddSubVertSum16, TVPAddSubVertSum16, tjs_uint16*, testdata1, testdata2, 128 * 256); + REGISTER_TVPGL_CUSTOM_FUNC_TYPE(TVPAddSubVertSum16_d, TVPAddSubVertSum16_d, tjs_uint16*, testdata1, testdata2, 128 * 256); + +// TVPAddSubVertSum32, TVPAddSubVertSum32_c; +// TVPAddSubVertSum32_d, TVPAddSubVertSum32_d_c; +// TVPDoBoxBlurAvg16, TVPDoBoxBlurAvg16_c; +// TVPDoBoxBlurAvg16_d, TVPDoBoxBlurAvg16_d_c; +// TVPDoBoxBlurAvg32, TVPDoBoxBlurAvg32_c; +// 
TVPDoBoxBlurAvg32_d, TVPDoBoxBlurAvg32_d_c; +// TVPSwapLine8, TVPSwapLine8_c; +// TVPSwapLine32, TVPSwapLine32_c; +// TVPReverse8, TVPReverse8_c; +// TVPReverse32, TVPReverse32_c; + REGISTER_TVPGL_CUSTOM_FUNC(TVPDoGrayScale, TVPDoGrayScale, 256 * 256); +// TVPInitGammaAdjustTempData, TVPInitGammaAdjustTempData_c; +// TVPUninitGammaAdjustTempData, TVPUninitGammaAdjustTempData_c; +// TVPAdjustGamma, TVPAdjustGamma_c; +// TVPAdjustGamma_a, TVPAdjustGamma_a_c; +// TVPChBlurMulCopy65, TVPChBlurMulCopy65_c; +// TVPChBlurAddMulCopy65, TVPChBlurAddMulCopy65_c; +// TVPChBlurCopy65, TVPChBlurCopy65_c; +// TVPBLExpand1BitTo8BitPal, TVPBLExpand1BitTo8BitPal_c; +// TVPBLExpand1BitTo8Bit, TVPBLExpand1BitTo8Bit_c; +// TVPBLExpand1BitTo32BitPal, TVPBLExpand1BitTo32BitPal_c; +// TVPBLExpand4BitTo8BitPal, TVPBLExpand4BitTo8BitPal_c; +// TVPBLExpand4BitTo8Bit, TVPBLExpand4BitTo8Bit_c; +// TVPBLExpand4BitTo32BitPal, TVPBLExpand4BitTo32BitPal_c; +// TVPBLExpand8BitTo8BitPal, TVPBLExpand8BitTo8BitPal_c;uni +// TVPBLExpand8BitTo32BitPal, TVPBLExpand8BitTo32BitPal_c; + + REGISTER_TVPGL_CUSTOM_FUNC(TVPExpand8BitTo32BitGray, TVPExpand8BitTo32BitGray, testrule, 256 * 256); +// TVPBLConvert15BitTo8Bit, TVPBLConvert15BitTo8Bit; + REGISTER_TVPGL_CUSTOM_FUNC(TVPBLConvert15BitTo32Bit, TVPBLConvert15BitTo32Bit, (const tjs_uint16*)testrule, 128 * 256); +// TVPBLConvert24BitTo8Bit, TVPBLConvert24BitTo8Bit; + REGISTER_TVPGL_ONLY(TVPBLConvert24BitTo32Bit, TVPConvert24BitTo32Bit_NEON); + REGISTER_TVPGL_CUSTOM_FUNC(TVPConvert24BitTo32Bit, TVPConvert24BitTo32Bit, testrule, 256 * 256 / 3); + REGISTER_TVPGL_ONLY(TVPConvert32BitTo24Bit, TVPConvert32BitTo24Bit); +// TVPBLConvert32BitTo8Bit, TVPBLConvert32BitTo8Bit; +// TVPBLConvert32BitTo32Bit_NoneAlpha, TVPBLConvert32BitTo32Bit_NoneAlpha; +// TVPBLConvert32BitTo32Bit_MulAddAlpha, TVPBLConvert32BitTo32Bit_MulAddAlpha; +// TVPBLConvert32BitTo32Bit_AddAlpha, TVPBLConvert32BitTo32Bit_AddAlpha; +// TVPDither32BitTo16Bit565, TVPDither32BitTo16Bit565; +// 
TVPDither32BitTo16Bit555, TVPDither32BitTo16Bit555; +// TVPDither32BitTo8Bit, TVPDither32BitTo8Bit; +// TVPTLG5DecompressSlide, TVPTLG5DecompressSlide; +// TVPTLG6DecodeGolombValuesForFirst, TVPTLG6DecodeGolombValuesForFirst; +// TVPTLG6DecodeGolombValues, TVPTLG6DecodeGolombValues; + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsAlphaBlend_HDA, TVPPsAlphaBlend); + REGISTER_TVPGL_ONLY(TVPPsAlphaBlend_HDA, TVPPsAlphaBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsAlphaBlend_HDA_o, TVPPsAlphaBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsAlphaBlend_HDA_o, TVPPsAlphaBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsAddBlend_HDA, TVPPsAddBlend); + REGISTER_TVPGL_ONLY(TVPPsAddBlend_HDA, TVPPsAddBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsAddBlend_HDA_o, TVPPsAddBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsAddBlend_HDA_o, TVPPsAddBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsSubBlend_HDA, TVPPsSubBlend); + REGISTER_TVPGL_ONLY(TVPPsSubBlend_HDA, TVPPsSubBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsSubBlend_HDA_o, TVPPsSubBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsSubBlend_HDA_o, TVPPsSubBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsMulBlend_HDA, TVPPsMulBlend); + REGISTER_TVPGL_ONLY(TVPPsMulBlend_HDA, TVPPsMulBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsMulBlend_HDA_o, TVPPsMulBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsMulBlend_HDA_o, TVPPsMulBlend_o_NEON); + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsScreenBlend_HDA, TVPPsScreenBlend); + REGISTER_TVPGL_ONLY(TVPPsScreenBlend_HDA, TVPPsScreenBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsScreenBlend_HDA_o, TVPPsScreenBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsScreenBlend_HDA_o, TVPPsScreenBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsOverlayBlend_HDA, TVPPsOverlayBlend); + REGISTER_TVPGL_ONLY(TVPPsOverlayBlend, TVPPsOverlayBlend_NEON); + REGISTER_TVPGL_ONLY(TVPPsOverlayBlend_HDA, TVPPsOverlayBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsOverlayBlend_HDA_o, TVPPsOverlayBlend_o, 100); + 
REGISTER_TVPGL_ONLY(TVPPsOverlayBlend_o, TVPPsOverlayBlend_o_NEON); + REGISTER_TVPGL_ONLY(TVPPsOverlayBlend_HDA_o, TVPPsOverlayBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsHardLightBlend_HDA, TVPPsHardLightBlend); + REGISTER_TVPGL_ONLY(TVPPsHardLightBlend, TVPPsHardLightBlend_NEON); + REGISTER_TVPGL_ONLY(TVPPsHardLightBlend_HDA, TVPPsHardLightBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsHardLightBlend_HDA_o, TVPPsHardLightBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsHardLightBlend_o, TVPPsHardLightBlend_o_NEON); + REGISTER_TVPGL_ONLY(TVPPsHardLightBlend_HDA_o, TVPPsHardLightBlend_o_NEON); + +// TVPPsSoftLightBlend = TVPPsSoftLightBlend_c; +// TVPPsSoftLightBlend_o = TVPPsSoftLightBlend_o_c; +// TVPPsSoftLightBlend_HDA = TVPPsSoftLightBlend_HDA_c; +// TVPPsSoftLightBlend_HDA_o = TVPPsSoftLightBlend_HDA_o_c; +// TVPPsColorDodgeBlend = TVPPsColorDodgeBlend_c; +// TVPPsColorDodgeBlend_o = TVPPsColorDodgeBlend_o_c; +// TVPPsColorDodgeBlend_HDA = TVPPsColorDodgeBlend_HDA_c; +// TVPPsColorDodgeBlend_HDA_o = TVPPsColorDodgeBlend_HDA_o_c; +// TVPPsColorDodge5Blend = TVPPsColorDodge5Blend_c; +// TVPPsColorDodge5Blend_o = TVPPsColorDodge5Blend_o_c; +// TVPPsColorDodge5Blend_HDA = TVPPsColorDodge5Blend_HDA_c; +// TVPPsColorDodge5Blend_HDA_o = TVPPsColorDodge5Blend_HDA_o_c; +// TVPPsColorBurnBlend = TVPPsColorBurnBlend_c; +// TVPPsColorBurnBlend_o = TVPPsColorBurnBlend_o_c; +// TVPPsColorBurnBlend_HDA = TVPPsColorBurnBlend_HDA_c; +// TVPPsColorBurnBlend_HDA_o = TVPPsColorBurnBlend_HDA_o_c; + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsLightenBlend_HDA, TVPPsLightenBlend); + REGISTER_TVPGL_ONLY(TVPPsLightenBlend_HDA, TVPPsLightenBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsLightenBlend_HDA_o, TVPPsLightenBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsLightenBlend_HDA_o, TVPPsLightenBlend_o_NEON); + + SHOW_AND_CLEAR_LOG; + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsDarkenBlend_HDA, TVPPsDarkenBlend); + REGISTER_TVPGL_ONLY(TVPPsDarkenBlend_HDA, TVPPsDarkenBlend_NEON); + 
REGISTER_TVPGL_BLEND_FUNC(TVPPsDarkenBlend_HDA_o, TVPPsDarkenBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsDarkenBlend_HDA_o, TVPPsDarkenBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsDiffBlend_HDA, TVPPsDiffBlend); + REGISTER_TVPGL_ONLY(TVPPsDiffBlend_HDA, TVPPsDiffBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsDiffBlend_HDA_o, TVPPsDiffBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsDiffBlend_HDA_o, TVPPsDiffBlend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsDiff5Blend_HDA, TVPPsDiff5Blend); + REGISTER_TVPGL_ONLY(TVPPsDiff5Blend_HDA, TVPPsDiff5Blend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsDiff5Blend_HDA_o, TVPPsDiff5Blend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsDiff5Blend_HDA_o, TVPPsDiff5Blend_o_NEON); + + REGISTER_TVPGL_BLEND_FUNC_2(TVPPsExclusionBlend_HDA, TVPPsExclusionBlend); + REGISTER_TVPGL_ONLY(TVPPsExclusionBlend_HDA, TVPPsExclusionBlend_NEON); + REGISTER_TVPGL_BLEND_FUNC(TVPPsExclusionBlend_HDA_o, TVPPsExclusionBlend_o, 100); + REGISTER_TVPGL_ONLY(TVPPsExclusionBlend_HDA_o, TVPPsExclusionBlend_o_NEON); + + REGISTER_TVPGL_ONLY(TVPTLG6DecodeLine, TVPTLG6DecodeLine_NEON); + REGISTER_TVPGL_ONLY(TVPTLG5ComposeColors3To4, TVPTLG5ComposeColors3To4_NEON); + REGISTER_TVPGL_ONLY(TVPTLG5ComposeColors4To4, TVPTLG5ComposeColors4To4_NEON); + REGISTER_TVPGL_ONLY(TVPTLG5DecompressSlide, TVPTLG5DecompressSlide_NEON); + + REGISTER_TVPGL_ONLY(TVPReverseRGB, TVPReverseRGB_NEON); + REGISTER_TVPGL_ONLY(TVPUpscale65_255, TVPUpscale65_255_NEON); + + SHOW_AND_CLEAR_LOG; +#endif +#ifdef DEBUG_ARM_NEON + free(testbuff); +#ifdef _DEBUG + TVPInitTVPGL(); +#endif +#endif + } +} + +FUNC_API void TVPGL_ASM_Test() { +#ifdef LOG_NEON_TEST + TVPCPUFeatures |= TVP_CPU_FAMILY_ARM | TVP_CPU_HAS_NEON; + TVPGL_ASM_Init(); +#endif +} \ No newline at end of file diff --git a/src/core/visual/FontImpl.cpp b/src/core/visual/FontImpl.cpp index acdf020c..e062e4be 100644 --- a/src/core/visual/FontImpl.cpp +++ b/src/core/visual/FontImpl.cpp @@ -14,7 +14,7 @@ #endif #ifdef _MSC_VER -#pragma 
comment(lib,"freetype250.lib") +#pragma comment(lib,"freetype.lib") #endif #include "platform/CCFileUtils.h" #include @@ -166,27 +166,24 @@ void TVPInitFontNames() // set default fontface name TVPDefaultFontName = TVPFontNames.GetLast().GetKey(); } - class tLister : public iTVPStorageLister - { - public: - std::vector list; - void TJS_INTF_METHOD Add(const ttstr &file) - { - list.emplace_back(file); - } - } lister; -#ifdef __ANDROID__ - TVPGetListAt(Android_GetInternalStoragePath() + "/fonts", &lister); - for (const ttstr &path : pathlist) { - TVPGetListAt(path + "/fonts", &lister); - } -#endif // check exePath + "/fonts/*.ttf" { - TVPGetLocalFileListAt(TVPGetAppPath() + "/fonts", &lister, S_IFREG); - auto itend = lister.list.end(); - for (auto it = lister.list.begin(); it != itend; ++it) { + std::vector list; + auto lister = [&](const ttstr &name, tTVPLocalFileInfo* s) { + if (s->Mode & (S_IFREG | S_IFDIR)) { + list.emplace_back(name); + } + }; +#ifdef __ANDROID__ + TVPGetLocalFileListAt(Android_GetInternalStoragePath() + "/fonts", lister); + for (const ttstr &path : pathlist) { + TVPGetLocalFileListAt(path + "/fonts", lister); + } +#endif + TVPGetLocalFileListAt(TVPGetAppPath() + "/fonts", lister); + auto itend = list.end(); + for (auto it = list.begin(); it != itend; ++it) { TVPEnumFontsProc(*it); } } diff --git a/src/core/visual/LayerBitmapIntf.cpp b/src/core/visual/LayerBitmapIntf.cpp index cc49c43e..e982e225 100644 --- a/src/core/visual/LayerBitmapIntf.cpp +++ b/src/core/visual/LayerBitmapIntf.cpp @@ -1780,12 +1780,12 @@ bool iTVPBaseBitmap::Blt(tjs_int x, tjs_int y, const iTVPBaseBitmap *ref, case ltOpaque: // formerly ltCoverRect // copy - met = opa == 255 ? bmCopyOnAlpha : bmCopy; + met = opa == 255 ? bmCopy : bmCopyOnAlpha; break; case ltAlpha: // formerly ltTransparent // alpha blend - met = bmAlpha; + met = hda ? 
bmAlpha : bmAlphaOnAlpha; break; case ltAdditive: diff --git a/src/core/visual/LayerIntf.cpp b/src/core/visual/LayerIntf.cpp index eaf74a6e..ffe0efca 100644 --- a/src/core/visual/LayerIntf.cpp +++ b/src/core/visual/LayerIntf.cpp @@ -55,6 +55,11 @@ bool TVPFreeUnusedLayerCache = false; // (layer cache is not freed until system compact event if this is false) //--------------------------------------------------------------------------- +static bool IsGPU() { + static bool isGPU = !TVPIsSoftwareRenderManager() + && !IndividualConfigManager::GetInstance()->GetValue("ogl_accurate_render", false); + return isGPU; +} //--------------------------------------------------------------------------- // temporary bitmap management @@ -294,7 +299,11 @@ void TVPTempBitmapHolderRelease() //--------------------------------------------------------------------------- // global options //--------------------------------------------------------------------------- -tTVPGraphicSplitOperationType TVPGraphicSplitOperationType = gsotNone;// gsotSimple; +#ifdef _DEBUG +tTVPGraphicSplitOperationType TVPGraphicSplitOperationType = gsotNone; +#else +tTVPGraphicSplitOperationType TVPGraphicSplitOperationType = gsotSimple; +#endif bool TVPDefaultHoldAlpha = false; //--------------------------------------------------------------------------- @@ -4144,7 +4153,11 @@ void tTJSNI_BaseLayer::PiledCopy(tjs_int dx, tjs_int dy, tTJSNI_BaseLayer *src, try { iTVPBaseBitmap *bmp = src->Complete(rect); - ImageModified = MainImage->CopyRect(dx, dy, bmp, rect, + tTVPRect rc(rect); + if (IsGPU()) { + rc.set_offsets(0, 0); + } + ImageModified = MainImage->CopyRect(dx, dy, bmp, rc, TVP_BB_COPY_MAIN|TVP_BB_COPY_MASK) || ImageModified; } catch(...) 
@@ -5966,10 +5979,13 @@ void tTJSNI_BaseLayer::EffectImage(iTVPBaseBitmap *dest, const tTVPRect & destre void tTJSNI_BaseLayer::Draw_GPU(tTVPDrawable *target, int x, int y, const tTVPRect &r, bool visiblecheck) { if(visiblecheck && !IsSeen()) return; - tTVPRect rect; + tTVPRect rect; if(!TVPIntersectRect(&rect, r, Rect)) return; // no intersection + x += rect.left - r.left; + y += rect.top - r.top; + tTVPRect rctar(rect); - rctar.add_offsets(x, y); + rctar.set_offsets(x, y); CurrentDrawTarget = target; @@ -5991,27 +6007,30 @@ void tTJSNI_BaseLayer::Draw_GPU(tTVPDrawable *target, int x, int y, const tTVPRe DrawSelf(target, rctar, rect); } else { // rearrange pipe line for transition + bool useTemp = false; if (GetCacheEnabled()) { UpdateBitmapForChild = CacheBitmap; } else { + useTemp = true; UpdateBitmapForChild = tTVPTempBitmapHolder::GetTemp( rect.get_width(), rect.get_height()); } // copy self image to UpdateBitmapForChild if (MainImage != NULL) { - if (UpdateExcludeRect.top <= rect.top && UpdateExcludeRect.bottom >= rect.bottom && - rect.left >= UpdateExcludeRect.left && rect.right <= UpdateExcludeRect.right) { - } else +// if (UpdateExcludeRect.top <= rect.top && UpdateExcludeRect.bottom >= rect.bottom && +// rect.left >= UpdateExcludeRect.left && rect.right <= UpdateExcludeRect.right) { +// } else CopySelfForRect(UpdateBitmapForChild, 0, 0, rect); // transfer self image } - x = 0; - y = 0; TVP_LAYER_FOR_EACH_CHILD_BEGIN(child) { // for each child... 
+ // visible check + if (!child->Visible) continue; + // intersection check if (!TVPIntersectRect(&UpdateRectForChild, rect, child->Rect)) continue; @@ -6019,15 +6038,16 @@ void tTJSNI_BaseLayer::Draw_GPU(tTVPDrawable *target, int x, int y, const tTVPRe // setup UpdateOfsX/Y UpdateRectForChildOfsX/Y UpdateOfsX = 0; UpdateOfsY = 0; - UpdateRectForChildOfsX = UpdateRectForChild.left - child->Rect.left; - UpdateRectForChildOfsY = UpdateRectForChild.top - child->Rect.top; + UpdateRectForChildOfsX = UpdateRectForChild.left - child->Rect.left; + UpdateRectForChildOfsY = UpdateRectForChild.top - child->Rect.top; // call children's "Draw" method - child->Draw_GPU((tTVPDrawable*)this, x, y, UpdateRectForChild); + child->Draw_GPU((tTVPDrawable*)this, UpdateRectForChild.left, UpdateRectForChild.top, UpdateRectForChild); } TVP_LAYER_FOR_EACH_CHILD_END rect.set_offsets(0, 0); target->DrawCompleted(rctar, UpdateBitmapForChild, rect, DisplayType, Opacity); + if (useTemp) tTVPTempBitmapHolder::FreeTemp(); } } else { if (GetVisibleChildrenCount() == 0) { @@ -6036,14 +6056,14 @@ void tTJSNI_BaseLayer::Draw_GPU(tTVPDrawable *target, int x, int y, const tTVPRe DrawnRegion.Clear(); // send completion message to the target - if (UpdateExcludeRect.top <= rect.top && UpdateExcludeRect.bottom >= rect.bottom && - rect.left >= UpdateExcludeRect.left && rect.right <= UpdateExcludeRect.right) { - } else { +// if (UpdateExcludeRect.top <= rect.top && UpdateExcludeRect.bottom >= rect.bottom && +// rect.left >= UpdateExcludeRect.left && rect.right <= UpdateExcludeRect.right) { +// } else + { tTVPRect rc(rect); DrawSelf(target, rctar, rc); } - x += Rect.left; - y += Rect.top; + TVP_LAYER_FOR_EACH_CHILD_BEGIN(child) { // for each child... 
@@ -6057,7 +6077,7 @@ void tTJSNI_BaseLayer::Draw_GPU(tTVPDrawable *target, int x, int y, const tTVPRe continue; // call children's "Draw" method - child->Draw_GPU(target, x, y, chrect); + child->Draw_GPU(target, x, y, rect); } TVP_LAYER_FOR_EACH_CHILD_END } @@ -6682,11 +6702,11 @@ void tTJSNI_BaseLayer::InternalComplete2(tTVPComplexRect & updateregion, { // split tjs_int rw = r.get_width(); - if(rw < 40) oh = 128; - else if(rw < 80) oh = 64; - else if(rw < 160) oh = 32; - else if(rw < 320) oh = 16; - else oh = 8; + if(rw < 40) oh = 256; + else if(rw < 80) oh = 128; + else if(rw < 160) oh = 64; + else if(rw < 320) oh = 32; + else oh = 16; // 2 lines per core in modern 8 cores cpu } else { @@ -6793,10 +6813,7 @@ void tTJSNI_BaseLayer::InternalComplete(tTVPComplexRect & updateregion, // at this point, final update region (in this completion) is determined InCompletion = true; - static bool isGPU = !TVPIsSoftwareRenderManager() - && !IndividualConfigManager::GetInstance()->GetValue("ogl_accurate_render", false); - - if (isGPU) { + if (IsGPU()) { InternalComplete2_GPU(updateregion.GetBound(), drawable); } else { InternalComplete2(updateregion, drawable); @@ -6813,13 +6830,11 @@ void tTJSNI_BaseLayer::CompleteForWindow(tTVPDrawable *drawable) if(Manager) Manager->NotifyUpdateRegionFixed(); InCompletion = true; - static bool isGPU = !TVPIsSoftwareRenderManager() - && !IndividualConfigManager::GetInstance()->GetValue("ogl_accurate_render", false); if(Manager) Manager->GetLayerTreeOwner()->StartBitmapCompletion(Manager); try { - if (isGPU) { + if (IsGPU()) { InternalComplete2_GPU(Rect, drawable); } else { InternalComplete2(Manager->GetUpdateRegionForCompletion(), drawable); @@ -6901,11 +6916,9 @@ tTVPBaseTexture * tTJSNI_BaseLayer::Complete(const tTVPRect & rect) return CacheBitmap; } - static bool isGPU = !TVPIsSoftwareRenderManager() - && !IndividualConfigManager::GetInstance()->GetValue("ogl_accurate_render", false); tTVPComplexRect ur; ur.Or(rect); - if (isGPU) { + if 
(IsGPU()) { tCompleteDrawable_GPU drawable(CacheBitmap, DisplayType); InternalComplete(ur, &drawable); // complete cache } else { diff --git a/src/core/visual/LayerManager.cpp b/src/core/visual/LayerManager.cpp index 0fe7d357..efdb5a13 100644 --- a/src/core/visual/LayerManager.cpp +++ b/src/core/visual/LayerManager.cpp @@ -75,6 +75,18 @@ void tTVPLayerManager::UnregisterSelfFromWindow() { LayerTreeOwner->UnregisterLayerManager(this); } + +void tTVPLayerManager::SetHoldAlpha(bool b) +{ + HoldAlpha = b; + if (!DrawBuffer) { + tjs_int w, h; + if (!GetPrimaryLayerSize(w, h)) return; + DrawBuffer = new tTVPDestTexture(w, h); + } + static_cast(DrawBuffer)->SetHoldAlpha(b); +} + //--------------------------------------------------------------------------- tTVPBaseTexture * tTVPLayerManager::GetDrawTargetBitmap(const tTVPRect &rect, tTVPRect &cliprect) @@ -139,7 +151,7 @@ void tTVPLayerManager::DrawCompleted(const tTVPRect &destrect, } } - DrawBuffer->Blt(destrect.left, destrect.top, bmp, cliprect, type, opacity, true); + DrawBuffer->Blt(destrect.left, destrect.top, bmp, cliprect, type, opacity, HoldAlpha); #endif } //--------------------------------------------------------------------------- @@ -1122,10 +1134,9 @@ void TJS_INTF_METHOD tTVPLayerManager::DumpLayerStructure() bool tTVPDestTexture::CopyRect(tjs_int x, tjs_int y, const iTVPBaseBitmap *ref, tTVPRect refrect, tjs_int plane /*= (TVP_BB_COPY_MAIN | TVP_BB_COPY_MASK)*/) { - return tTVPBaseTexture::CopyRect(x, y, ref, refrect, TVP_BB_COPY_MAIN); -} - -bool tTVPDestBitmap::CopyRect(tjs_int x, tjs_int y, const iTVPBaseBitmap *ref, tTVPRect refrect, tjs_int plane /*= (TVP_BB_COPY_MAIN | TVP_BB_COPY_MASK)*/) -{ - return tTVPBaseBitmap::CopyRect(x, y, ref, refrect, TVP_BB_COPY_MAIN); + if (HoldAlpha) { + return tTVPBaseTexture::CopyRect(x, y, ref, refrect, TVP_BB_COPY_MAIN); + } else { + return tTVPBaseTexture::CopyRect(x, y, ref, refrect, plane); + } } diff --git a/src/core/visual/LayerManager.h 
b/src/core/visual/LayerManager.h index 99108c84..e7da4318 100644 --- a/src/core/visual/LayerManager.h +++ b/src/core/visual/LayerManager.h @@ -219,6 +219,8 @@ struct tTVPTouchCaptureLayer { // texture for last render target class tTVPDestTexture : public tTVPBaseTexture { + bool HoldAlpha = true; + public: tTVPDestTexture(tjs_uint w, tjs_uint h) : tTVPBaseTexture(w, h) {} @@ -226,14 +228,8 @@ class tTVPDestTexture : public tTVPBaseTexture // tTVPRect refrect, tTVPBBBltMethod method, tjs_int opa); virtual bool CopyRect(tjs_int x, tjs_int y, const iTVPBaseBitmap *ref, tTVPRect refrect, tjs_int plane = (TVP_BB_COPY_MAIN | TVP_BB_COPY_MASK)); -}; -class tTVPDestBitmap : public tTVPBaseBitmap -{ -public: - tTVPDestBitmap(tjs_uint w, tjs_uint h) : tTVPBaseBitmap(w, h) {} - virtual bool CopyRect(tjs_int x, tjs_int y, const iTVPBaseBitmap *ref, - tTVPRect refrect, tjs_int plane = (TVP_BB_COPY_MAIN | TVP_BB_COPY_MASK)); + void SetHoldAlpha(bool b) { HoldAlpha = b; } }; //--------------------------------------------------------------------------- @@ -278,7 +274,7 @@ class tTVPLayerManager : public iTVPLayerManager, public tTVPDrawable bool ReleaseCaptureCalled; bool InNotifyingHintOrCursorChange; - + bool HoldAlpha = true; public: tTVPLayerManager(class iTVPLayerTreeOwner *owner); @@ -297,6 +293,7 @@ class tTVPLayerManager : public iTVPLayerManager, public tTVPDrawable public: virtual void TJS_INTF_METHOD SetDesiredLayerType(tTVPLayerType type) { DesiredLayerType = type; } + void SetHoldAlpha(bool b); public: // methods from tTVPDrawable virtual tTVPBaseTexture * GetDrawTargetBitmap(const tTVPRect &rect, diff --git a/src/core/visual/LoadJXR.cpp b/src/core/visual/LoadJXR.cpp index 944140fd..6496aade 100644 --- a/src/core/visual/LoadJXR.cpp +++ b/src/core/visual/LoadJXR.cpp @@ -405,9 +405,9 @@ external/jxrlib/jxrgluelib/JXRGlueLib_vc11.vcxproj */ #if defined( WIN32 ) #ifdef _DEBUG -#pragma comment(lib, "JXRCommonLib_d.lib") -#pragma comment(lib, "JXRDecodeLib_d.lib") -#pragma 
comment(lib, "JXREncodeLib_d.lib") +// #pragma comment(lib, "JXRCommonLib_d.lib") +// #pragma comment(lib, "JXRDecodeLib_d.lib") +// #pragma comment(lib, "JXREncodeLib_d.lib") #pragma comment(lib, "JXRGlueLib_d.lib") #else #pragma comment(lib, "JXRCommonLib.lib") diff --git a/src/core/visual/Resampler.cpp b/src/core/visual/Resampler.cpp deleted file mode 100644 index c323731f..00000000 --- a/src/core/visual/Resampler.cpp +++ /dev/null @@ -1,466 +0,0 @@ -//--------------------------------------------------------------------------- -/* - TVP2 ( T Visual Presenter 2 ) A script authoring tool - Copyright (C) 2000-2007 W.Dee and contributors - - See details of license at "license.txt" -*/ -//--------------------------------------------------------------------------- -// Image resampler -//--------------------------------------------------------------------------- -/* - based on Graphics Gems III - "Filtered Image Rescaling" by Dale Schumacher -*/ -//--------------------------------------------------------------------------- -#include "tjsCommHead.h" - -#include -#include "LayerBitmapIntf.h" -#include "MsgIntf.h" -#include "RenderManager.h" - -//--------------------------------------------------------------------------- -typedef float real_t; -//--------------------------------------------------------------------------- -#pragma pack(push,1) -typedef struct -{ - unsigned char r; - unsigned char g; - unsigned char b; - unsigned char a; -} pixel_t; -#pragma pack(pop) -typedef struct -{ - real_t r; - real_t g; - real_t b; - real_t a; -} pixel_real_t_t; -//--------------------------------------------------------------------------- -static inline int CLAMP(int v, int l, int h) - { return ((v)<(l) ? (l) : (v) > (h) ? 
(h) : (v)); } -//--------------------------------------------------------------------------- -typedef struct { - int pixel; - real_t weight; -} CONTRIB; - -typedef struct { - int n; /* number of contributors */ - CONTRIB *p; /* pointer to list of contributions */ -} CLIST; -//--------------------------------------------------------------------------- -static int roundcloser(real_t d) -{ - /* return fabs(ceil(d)-d) <= 0.5 ? ceil(d) : floor(d); */ - - int n = (int) d; - real_t diff = d - (real_t)n; - if(diff < 0) - diff = -diff; - if(diff >= 0.5) - { - if(d < 0) - n--; - else - n++; - } - return n; -} /* roundcloser */ -//--------------------------------------------------------------------------- -static int calc_x_contrib(CLIST *contribX, real_t xscale, real_t fwidth, int dstwidth, - int srcwidth, real_t (*filterf)(real_t), int i) -{ - real_t width; - real_t fscale; - real_t center, left, right; - real_t weight; - int j, k, n; - - if(xscale < 1.0) - { - /* Shrinking image */ - real_t w_sum = 0; - width = fwidth / xscale; -// fscale = 1.0 / xscale; - - contribX->n = 0; - contribX->p = (CONTRIB *)calloc((int) (width * 2 + 2), - sizeof(CONTRIB)); - if(contribX->p == NULL) - return -1; - - center = (real_t) i / xscale; - left = ceil(center - width); - right = floor(center + width); - for(j = (int)left; j <= right; ++j) - { - weight = center - (real_t) j; - weight = (*filterf)(weight * xscale); // / fscale; - w_sum += weight; - if(j < 0) - n = -j; - else if(j >= srcwidth) - n = (srcwidth - j) + srcwidth - 1; - else - n = j; - - if(n < 0) n = 0; - if(n >= srcwidth - 1) n = srcwidth - 1; - - k = contribX->n++; - contribX->p[k].pixel = n; - contribX->p[k].weight = weight; - } - - if(w_sum != 0.0) - { - w_sum = 1.0 / w_sum; - for(j = 0; j < contribX->n; j ++) - contribX->p[j].weight *= w_sum; - } - - } - else - { - /* Expanding image */ - contribX->n = 0; - contribX->p = (CONTRIB *)calloc((int) (fwidth * 2 + 1), - sizeof(CONTRIB)); - if(contribX->p == NULL) - return -1; - 
center = (real_t) i / xscale; - left = ceil(center - fwidth); - right = floor(center + fwidth); - - for(j = (int)left; j <= right; ++j) - { - weight = center - (real_t) j; - weight = (*filterf)(weight); - if(j < 0) { - n = -j; - } else if(j >= srcwidth) { - n = (srcwidth - j) + srcwidth - 1; - } else { - n = j; - } - - if(n < 0) n = 0; - if(n >= srcwidth - 1) n = srcwidth - 1; - - k = contribX->n++; - contribX->p[k].pixel = n; - contribX->p[k].weight = weight; - } - } - return 0; -} /* calc_x_contrib */ -//--------------------------------------------------------------------------- -static int ib_resample(iTVPBaseBitmap *dst, - const tTVPRect &destrect, - const iTVPBaseBitmap *src, - const tTVPRect &srcrect, - real_t (*filterf)(real_t), -// void (*blend)(tjs_uint32 *dest, tjs_uint32 src), - real_t fwidth) -{ - pixel_t * tmp; - real_t xscale, yscale; /* zoom scale factors */ - int xx; - int i, j, k; /* loop variables */ - int n; /* pixel number */ - real_t center, left, right; /* filter calculation variables */ - real_t width, fscale; /* filter calculation variables */ - pixel_real_t_t weight; - pixel_t pel, pel2; - pixel_t bPelDelta; - CLIST *contribY; /* array of contribution lists */ - CLIST contribX; - int nRet = -1; - int srcwidth = srcrect.get_width(); - int srcheight = srcrect.get_height(); - int destwidth = destrect.get_width(); - int destheight = destrect.get_height(); - - /* create intermediate column to hold horizontal dst column zoom */ - tmp = (pixel_t*)malloc(srcheight * sizeof(pixel_t)); - if(tmp == NULL) - return 0; - - xscale = (real_t) destwidth / (real_t) srcwidth; - - /* Build y weights */ - /* pre-calculate filter contributions for a column */ - contribY = (CLIST *)calloc(destheight, sizeof(CLIST)); - if(contribY == NULL) - { - free(tmp); - return -1; - } - - yscale = (real_t) destheight / (real_t) srcheight; - - if(yscale < 1.0) - { - real_t weight; - width = fwidth / yscale; - fscale = 1.0 / yscale; - for(i = 0; i < destheight; ++i) - { - 
real_t w_sum = 0; - - contribY[i].n = 0; - contribY[i].p = (CONTRIB *)calloc((int) (width * 2 + 1), - sizeof(CONTRIB)); - if(contribY[i].p == NULL) - { - free(tmp); - free(contribY); - return -1; - } - center = (real_t) i * fscale; - left = ceil(center - width); - right = floor(center + width); - for(j = (int)left; j <= right; ++j) - { - weight = center - (real_t) j; - weight = (*filterf)(weight * yscale); //* yscale; - w_sum += weight; - if(j < 0) - { - n = -j; - } - else if(j >= srcheight) - { - n = (srcheight - j) + srcheight - 1; - } - else - { - n = j; - } - if(n < 0) n = 0; - if(n >= srcheight - 1) n = srcheight - 1; - k = contribY[i].n++; - contribY[i].p[k].pixel = n; - contribY[i].p[k].weight = weight; - } - - if(w_sum != 0.0) - { - w_sum = 1.0 / w_sum; - for(j = 0; j < contribY[i].n; j ++) - contribY[i].p[j].weight *= w_sum; - } - - } - } - else - { - real_t weight; - for(i = 0; i < destheight; ++i) - { - contribY[i].n = 0; - contribY[i].p = (CONTRIB *)calloc((int) (fwidth * 2 + 1), - sizeof(CONTRIB)); - if(contribY[i].p == NULL) - { - free(tmp); - free(contribY); - return -1; - } - center = (real_t) i / yscale; - left = ceil(center - fwidth); - right = floor(center + fwidth); - for(j = (int)left; j <= right; ++j) - { - weight = center - (real_t) j; - weight = (*filterf)(weight); - if(j < 0) - { - n = -j; - } - else if(j >= srcheight) - { - n = (srcheight - j) + srcheight - 1; - } - else - { - n = j; - } - if(n < 0) n = 0; - if(n >= srcheight - 1) n = srcheight - 1; - k = contribY[i].n++; - contribY[i].p[k].pixel = n; - contribY[i].p[k].weight = weight; - } - } - } - - - tjs_int srcpitchbytes = src->GetPitchBytes(); - const pixel_t *srclinestart = (const pixel_t*) - ((const tjs_uint8*)src->GetTexture()->GetPixelData() + srcrect.top * srcpitchbytes) + srcrect.left; - - pixel_t *destlinestart = (pixel_t*)dst->GetScanLineForWrite(destrect.top) + destrect.left; - tjs_int destpitchbytes = dst->GetPitchBytes(); - - for(xx = 0; xx < destwidth; xx++) - { - if(0 != 
calc_x_contrib(&contribX, xscale, fwidth, - destwidth, srcwidth, filterf, xx)) - { - goto __zoom_cleanup; - } - /* Apply horz filter to make dst column in tmp. */ - { - const pixel_t *line = srclinestart; - for(k = 0; k < srcheight; ++k) - { - weight.r = weight.g = weight.b = weight.a = 0; - bPelDelta.r = bPelDelta.g = bPelDelta.b = bPelDelta.a = 0; - pel = line[contribX.p[0].pixel]; - for(j = contribX.n - 1; j >= 0; --j) - { - CONTRIB *c = contribX.p + j; - pel2 = line[c->pixel]; - bPelDelta.r |= (pel2.r - pel.r); - bPelDelta.b |= (pel2.b - pel.b); - bPelDelta.g |= (pel2.g - pel.g); - bPelDelta.a |= (pel2.a - pel.a); - weight.r += pel2.r * c->weight; - weight.g += pel2.g * c->weight; - weight.b += pel2.b * c->weight; - weight.a += pel2.a * c->weight; - } - weight.r = bPelDelta.r ? roundcloser(weight.r) : pel.r; - weight.g = bPelDelta.g ? roundcloser(weight.g) : pel.g; - weight.b = bPelDelta.b ? roundcloser(weight.b) : pel.b; - weight.a = bPelDelta.a ? roundcloser(weight.a) : pel.a; - - tmp[k].r = (unsigned char)CLAMP(weight.r, 0, 255); - tmp[k].g = (unsigned char)CLAMP(weight.g, 0, 255); - tmp[k].b = (unsigned char)CLAMP(weight.b, 0, 255); - tmp[k].a = (unsigned char)CLAMP(weight.a, 0, 255); - - *(tjs_uint8**)(&line) += srcpitchbytes; - } /* next row in temp column */ - } - - free(contribX.p); - - /* The temp column has been built. Now stretch it - vertically into dst column. 
*/ - { - pixel_t *line = destlinestart; - for(i = 0; i < destheight; ++i) - { - CLIST *cl = contribY + i; - weight.r = weight.g = weight.b = weight.a = 0; - bPelDelta.r = bPelDelta.g = bPelDelta.b = bPelDelta.a = 0; - pel = tmp[cl->p[0].pixel]; - - for(j = cl->n - 1; j >= 0; --j) - { - CONTRIB *c = cl->p + j; - pel2 = tmp[c->pixel]; - bPelDelta.r |= (pel2.r - pel.r); - bPelDelta.b |= (pel2.b - pel.b); - bPelDelta.g |= (pel2.g - pel.g); - bPelDelta.a |= (pel2.a - pel.a); - weight.r += pel2.r * c->weight; - weight.g += pel2.g * c->weight; - weight.b += pel2.b * c->weight; - weight.a += pel2.a * c->weight; - } - weight.r = bPelDelta.r ? roundcloser(weight.r) : pel.r; - weight.g = bPelDelta.g ? roundcloser(weight.g) : pel.g; - weight.b = bPelDelta.b ? roundcloser(weight.b) : pel.b; - weight.a = bPelDelta.a ? roundcloser(weight.a) : pel.a; - line[xx].r = (unsigned char)CLAMP(weight.r, 0, 255); - line[xx].g = (unsigned char)CLAMP(weight.g, 0, 255); - line[xx].b = (unsigned char)CLAMP(weight.b, 0, 255); - line[xx].a = (unsigned char)CLAMP(weight.a, 0, 255); - - *(tjs_uint8**)(&line) += destpitchbytes; - } /* next dst row */ - } - } /* next dst column */ - nRet = 0; /* success */ - -__zoom_cleanup: - free(tmp); - - /* free the memory allocated for vertical filter weights */ - for(i = 0; i < destheight; ++i) - free(contribY[i].p); - free(contribY); - - return nRet; -} /* ib_resample */ -//--------------------------------------------------------------------------- -#define filter_support (1.0) -real_t -filter(real_t t) -{ - /* f(t) = 2|t|^3 - 3|t|^2 + 1, -1 <= t <= 1 */ - if(t < 0.0) t = -t; - if(t < 1.0) return((2.0 * t - 3.0) * t * t + 1.0); - return(0.0); -} - -#define box_support (0.5) -real_t -box_filter(real_t t) -{ - if((t > -0.5) && (t <= 0.5)) return(1.0); - return(0.0); -} - - -#define triangle_support (1.0) -static real_t -triangle_filter(real_t t) -{ - if(t < 0.0) t = -t; - if(t < 1.0) return(1.0 - t); - return(0.0); -} - -#define bell_support (1.5) -static 
real_t -bell_filter(real_t t) /* box (*) box (*) box */ -{ - if(t < 0) t = -t; - if(t < 0.5) return(0.75 - (t * t)); - if(t < 1.5) { - t = (t - 1.5); - return(0.5 * (t * t)); - } - return(0.0); -} - - -//--------------------------------------------------------------------------- -void TVPResampleImage(const tTVPRect &cliprect, iTVPBaseBitmap *dest, const tTVPRect &destrect, const iTVPBaseBitmap *src, const tTVPRect &srcrect, - tTVPBBStretchType type, tjs_real typeopt, tTVPBBBltMethod method, tjs_int opa, bool hda) -{ - int ret = -1; - switch(mode) - { - case 1: // linear interpolation - default: - ret = ib_resample(dest, destrect, src, srcrect, triangle_filter, triangle_support); - break; - - case 2: // cubic interpolation - ret = ib_resample(dest, destrect, src, srcrect, bell_filter, bell_support); - break; - } - - if(ret) TVPThrowInternalError; -} -//--------------------------------------------------------------------------- - diff --git a/src/core/visual/Resampler.h b/src/core/visual/Resampler.h deleted file mode 100644 index bd2549d3..00000000 --- a/src/core/visual/Resampler.h +++ /dev/null @@ -1,21 +0,0 @@ -//--------------------------------------------------------------------------- -/* - TVP2 ( T Visual Presenter 2 ) A script authoring tool - Copyright (C) 2000-2007 W.Dee and contributors - - See details of license at "license.txt" -*/ -//--------------------------------------------------------------------------- -// Image resampler -//--------------------------------------------------------------------------- -#ifndef ResamplerH -#define ResamplerH - - -//--------------------------------------------------------------------------- -class iTVPBaseBitmap; -extern void TVPResampleImage(const tTVPRect &cliprect, iTVPBaseBitmap *dest, const tTVPRect &destrect, const iTVPBaseBitmap *src, const tTVPRect &srcrect, - tTVPBBStretchType type, tjs_real typeopt, tTVPBBBltMethod method, tjs_int opa, bool hda); 
-//--------------------------------------------------------------------------- - -#endif \ No newline at end of file diff --git a/src/core/visual/WindowIntf.h b/src/core/visual/WindowIntf.h index 8f33c70f..9c2bc90b 100644 --- a/src/core/visual/WindowIntf.h +++ b/src/core/visual/WindowIntf.h @@ -61,6 +61,7 @@ enum tTVPMouseCursorState //--------------------------------------------------------------------------- //! @brief Window basic interface //--------------------------------------------------------------------------- +class iWindowLayer; class iTVPWindow { public: @@ -130,6 +131,8 @@ class iTVPWindow //! @brief WindowのiTJSDispatch2インターフェースを取得する virtual iTJSDispatch2 * GetWindowDispatch() = 0; + // add by ZeaS + virtual iWindowLayer* GetForm() const = 0; }; //--------------------------------------------------------------------------- /*]*/ diff --git a/src/core/visual/gl/aligned_allocator.h b/src/core/visual/gl/aligned_allocator.h index eaa66992..ee998bfd 100644 --- a/src/core/visual/gl/aligned_allocator.h +++ b/src/core/visual/gl/aligned_allocator.h @@ -3,7 +3,6 @@ #ifndef __ALIGNED_ALLOCATOR_H__ #define __ALIGNED_ALLOCATOR_H__ -#include // _aligned_malloc and _aligned_free #include // std::allocator // STL allocator diff --git a/src/core/visual/ogl/RenderManager_ogl.cpp b/src/core/visual/ogl/RenderManager_ogl.cpp index f3b066c0..6ced7806 100644 --- a/src/core/visual/ogl/RenderManager_ogl.cpp +++ b/src/core/visual/ogl/RenderManager_ogl.cpp @@ -59,6 +59,27 @@ static void ShowInMessageBox(const char *format, ...) 
{ } while (false) #endif +struct _UsedGLExtInfo { + _UsedGLExtInfo(){} + const char *NameBegin = nullptr; +#define _DEFEXT(name) #name +#define DEFEXT(name) const char *GLEXT_##name = _DEFEXT(GL_##name) + DEFEXT(EXT_unpack_subimage); + DEFEXT(EXT_shader_framebuffer_fetch); + DEFEXT(ARM_shader_framebuffer_fetch); + DEFEXT(NV_shader_framebuffer_fetch); + DEFEXT(EXT_copy_image); + DEFEXT(OES_copy_image); + DEFEXT(ARB_copy_image); + DEFEXT(NV_copy_image); + DEFEXT(EXT_clear_texture); + DEFEXT(ARB_clear_texture); + DEFEXT(QCOM_alpha_test); +#undef DEFEXT + const char *NameEnd = nullptr; +}; +static const _UsedGLExtInfo UsedGLExtInfo; + static std::unordered_set sTVPGLExtensions; // some quick check flags static bool GL_CHECK_unpack_subimage; @@ -67,10 +88,10 @@ static bool GL_CHECK_shader_framebuffer_fetch; bool TVPCheckGLExtension(const std::string &extname) { return sTVPGLExtensions.find(extname) != sTVPGLExtensions.end(); } +static bool TVPGLExtensionInfoInited = false; static void TVPInitGLExtensionInfo() { - static bool inited = false; - if (inited) return; - inited = true; + if (TVPGLExtensionInfoInited) return; + TVPGLExtensionInfoInited = true; std::string gl_extensions = (const char*)glGetString(GL_EXTENSIONS); const char *p = gl_extensions.c_str(); for (char &c : gl_extensions) { @@ -82,6 +103,14 @@ static void TVPInitGLExtensionInfo() { } } if (*p) sTVPGLExtensions.emplace(p); + IndividualConfigManager *cfgMgr = IndividualConfigManager::GetInstance(); + for (const char *const *name = (&UsedGLExtInfo.NameBegin) + 1; *name; ++name) { + if (!cfgMgr->GetValue(*name, 1)) { +#ifndef _MSC_VER + sTVPGLExtensions.erase(*name); +#endif + } + } #ifdef WIN32 sTVPGLExtensions.erase("GL_EXT_unpack_subimage"); #endif @@ -90,11 +119,6 @@ static void TVPInitGLExtensionInfo() { cocos2d::log("%s", line.c_str()); } #endif - GL_CHECK_unpack_subimage = TVPCheckGLExtension("GL_EXT_unpack_subimage"); - GL_CHECK_shader_framebuffer_fetch = - 
TVPCheckGLExtension("GL_EXT_shader_framebuffer_fetch") || - TVPCheckGLExtension("GL_ARM_shader_framebuffer_fetch") || - TVPCheckGLExtension("GL_NV_shader_framebuffer_fetch"); } namespace GL { // independ from global gl functions @@ -103,10 +127,12 @@ namespace GL { // independ from global gl functions #endif #ifdef _MSC_VER typedef PROC (WINAPI fGetProcAddress)(LPCSTR); +#elif defined(TARGET_OS_IPHONE) +typedef void* (fGetProcAddress)(const char *); #else typedef void* (EGLAPIENTRY fGetProcAddress)(const char *); #endif -static fGetProcAddress *glGetProcAddress; +static fGetProcAddress *glGetProcAddress = nullptr; typedef void (GLAPIENTRY fCopyImageSubData)(GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei width, GLsizei height, GLsizei depth); static fCopyImageSubData *glCopyImageSubData; @@ -124,27 +150,6 @@ typedef void (GLAPIENTRY fAlphaFunc)(GLenum func, GLclampf ref); static fAlphaFunc *glAlphaFunc; } -struct _UsedGLExtInfo { - _UsedGLExtInfo(){} - const char *NameBegin = nullptr; -#define _DEFEXT(name) #name -#define DEFEXT(name) const char *GLEXT_##name = _DEFEXT(GL_##name) - DEFEXT(EXT_unpack_subimage); - DEFEXT(EXT_shader_framebuffer_fetch); - DEFEXT(ARM_shader_framebuffer_fetch); - DEFEXT(NV_shader_framebuffer_fetch); - DEFEXT(EXT_copy_image); - DEFEXT(OES_copy_image); - DEFEXT(ARB_copy_image); - DEFEXT(NV_copy_image); - DEFEXT(EXT_clear_texture); - DEFEXT(ARB_clear_texture); - DEFEXT(QCOM_alpha_test); -#undef DEFEXT - const char *NameEnd = nullptr; -}; -static const _UsedGLExtInfo UsedGLExtInfo; - static void TVPInitGLExtensionFunc() { #ifdef _MSC_VER GL::glGetProcAddress = wglGetProcAddress; @@ -152,28 +157,34 @@ static void TVPInitGLExtensionFunc() { GL::glGetProcAddress = (GL::fGetProcAddress*)eglGetProcAddress; #endif + GL_CHECK_unpack_subimage = TVPCheckGLExtension("GL_EXT_unpack_subimage"); + 
GL_CHECK_shader_framebuffer_fetch = + TVPCheckGLExtension("GL_EXT_shader_framebuffer_fetch") || + TVPCheckGLExtension("GL_ARM_shader_framebuffer_fetch") || + TVPCheckGLExtension("GL_NV_shader_framebuffer_fetch"); + + if (GL::glGetProcAddress) { #ifdef _MSC_VER - GL::glGetTextureImage = (GL::fGetTextureImage*)GL::glGetProcAddress("glGetTextureImage"); -#endif -#ifdef _MSC_VER -// if (!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_EXT_copy_image)) -// GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubDataEXT"); -// if (!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_OES_copy_image)) -// GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubDataOES"); - if (!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_NV_copy_image)) - GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubDataNV"); - if (!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_ARB_copy_image)) - GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubData"); + GL::glGetTextureImage = (GL::fGetTextureImage*)GL::glGetProcAddress("glGetTextureImage"); #endif - if (!GL::glClearTexImage && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_EXT_clear_texture)) { - GL::glClearTexImage = (GL::fClearTexImage*)GL::glGetProcAddress("glClearTexImageEXT"); - GL::glClearTexSubImage = (GL::fClearTexSubImage*)GL::glGetProcAddress("glClearTexSubImageEXT"); - } - if (!GL::glClearTexImage && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_ARB_clear_texture)) { - GL::glClearTexImage = (GL::fClearTexImage*)GL::glGetProcAddress("glClearTexImage"); - GL::glClearTexSubImage = (GL::fClearTexSubImage*)GL::glGetProcAddress("glClearTexSubImage"); - } + if (!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_EXT_copy_image)) + GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubDataEXT"); + if 
(!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_OES_copy_image)) + GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubDataOES"); + if (!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_NV_copy_image)) + GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubDataNV"); + if (!GL::glCopyImageSubData && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_ARB_copy_image)) + GL::glCopyImageSubData = (GL::fCopyImageSubData*)GL::glGetProcAddress("glCopyImageSubData"); + if (!GL::glClearTexImage && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_EXT_clear_texture)) { + GL::glClearTexImage = (GL::fClearTexImage*)GL::glGetProcAddress("glClearTexImageEXT"); + GL::glClearTexSubImage = (GL::fClearTexSubImage*)GL::glGetProcAddress("glClearTexSubImageEXT"); + } + if (!GL::glClearTexImage && TVPCheckGLExtension(UsedGLExtInfo.GLEXT_ARB_clear_texture)) { + GL::glClearTexImage = (GL::fClearTexImage*)GL::glGetProcAddress("glClearTexImage"); + GL::glClearTexSubImage = (GL::fClearTexSubImage*)GL::glGetProcAddress("glClearTexSubImage"); + } + } #ifdef GL_ALPHA_TEST GL::glAlphaFunc = glAlphaFunc; #else @@ -184,7 +195,20 @@ static void TVPInitGLExtensionFunc() { } std::string TVPGetOpenGLInfo() { - TVPInitGLExtensionInfo(); +// TVPInitGLExtensionInfo(); + std::unordered_set Extensions; + std::string gl_extensions = (const char*)glGetString(GL_EXTENSIONS); + const char *p = gl_extensions.c_str(); + for (char &c : gl_extensions) { + if (c == ' ') { + c = 0; + Extensions.emplace(p); + p = &c; + ++p; + } + } + if (*p) Extensions.emplace(p); + std::stringstream ret; ret << "Renderer : "; ret << glGetString(GL_RENDERER); ret << "\n"; ret << "Vendor : "; ret << glGetString(GL_VENDOR); ret << "\n"; @@ -193,7 +217,7 @@ std::string TVPGetOpenGLInfo() { ret << "MaxTexureSize : "; ret << int(maxTextSize); ret << "\n"; ret << LocaleConfigManager::GetInstance()->GetText("supported_opengl_extension"); for (const char *const 
*name = (&UsedGLExtInfo.NameBegin) + 1; *name; ++name) { - if (TVPCheckGLExtension(*name)) { + if (Extensions.find(*name) != Extensions.end()) { ret << "\n"; ret << *name; } @@ -201,6 +225,31 @@ std::string TVPGetOpenGLInfo() { return ret.str(); } +// bool TVPOnOpenGLRendererSelected(bool forceNotice) { +// if (!strstr((const char*)glGetString(GL_RENDERER), "Adreno")) { +// return false; +// } +// // TVPInitGLExtensionInfo(); +// bool ret = false; +// if (strstr((const char*)glGetString(GL_EXTENSIONS), "GL_EXT_shader_framebuffer_fetch")) { +// ret = true; +// if (forceNotice || GlobalConfigManager::GetInstance()->GetValue("noticed_adreno_issue", 0) < 1) { +// const char *btnText[2] = { +// LocaleConfigManager::GetInstance()->GetText("msgbox_ok").c_str(), +// LocaleConfigManager::GetInstance()->GetText("msgbox_nerver_notice").c_str(), +// }; +// int n = TVPShowSimpleMessageBox(LocaleConfigManager::GetInstance()->GetText +// ("issue_GL_EXT_shader_framebuffer_fetch").c_str(), "Info", forceNotice ? 
1 : 2, btnText); +// if (n == 1) { +// GlobalConfigManager::GetInstance()->SetValueInt("noticed_adreno_issue", 1); +// GlobalConfigManager::GetInstance()->SaveToFile(); +// } +// } +// } +// // TVPGLExtensionInfoInited = false; +// return ret; +// } + class tTVPOGLTexture2D; struct GLVertexInfo { tTVPOGLTexture2D* tex; @@ -1129,13 +1178,13 @@ class tTVPOGLTexture2D_static : public tTVPOGLTexture2D { : tTVPOGLTexture2D(tw, th, format, mode) { _scaleW = sw; _scaleH = sh; - assert(pixel); // pixel must be exist + // assert(pixel); // pixel must be exist int pixsize = getPixelSize(); if (pitch == iw * pixsize || ((pitch & 7) == 0 && pitch - iw * pixsize < 8)) { InternalInit(pixel, iw, ih, pitch); } else if (GL_CHECK_unpack_subimage) { InternalInit(nullptr, iw, ih, 0); - Update(pixel, Format, pitch, tTVPRect(0, 0, iw, ih)); + InternalUpdate(pixel, pitch, 0, 0, iw, ih); } else { // rearrange InternalInit(nullptr, iw, ih, 0); PixelData = new unsigned char[internalW * internalH * 4]; @@ -1147,7 +1196,7 @@ class tTVPOGLTexture2D_static : public tTVPOGLTexture2D { src += pitch; dst += dstpitch; } - Update(PixelData, Format, dstpitch, tTVPRect(0, 0, internalW, internalH)); + InternalUpdate(PixelData, dstpitch, 0, 0, internalW, internalH); delete[]PixelData; PixelData = nullptr; } @@ -1155,6 +1204,10 @@ class tTVPOGLTexture2D_static : public tTVPOGLTexture2D { } virtual void Update(const void *pixel, TVPTextureFormat::e format, int pitch, const tTVPRect& rc) { + if (PixelData) { + delete[] PixelData; + PixelData = nullptr; + } InternalUpdate(pixel, pitch, rc.left, rc.top, rc.get_width(), rc.get_height()); }; @@ -1185,7 +1238,7 @@ class tTVPOGLTexture2D_mutatble : public tTVPOGLTexture2D { public: tTVPOGLTexture2D_mutatble(const void *pixel, int pitch, unsigned int w, unsigned int h, TVPTextureFormat::e format, float sw, float sh) - : tTVPOGLTexture2D(w, h, format, GL_LINEAR) + : tTVPOGLTexture2D(w, h, format == TVPTextureFormat::RGB ? 
TVPTextureFormat::RGBA : format, GL_LINEAR) { if (!pixel) { _scaleW = sw; _scaleH = sh; @@ -1503,8 +1556,14 @@ class tTVPOGLRenderMethod_Script : public tTVPOGLRenderMethod { } virtual void Rebuild() { - program = CombineProgram(GetVertShader(m_nTex), - CompileShader(GL_FRAGMENT_SHADER, m_strScript)); + try { + program = CombineProgram(GetVertShader(m_nTex), + CompileShader(GL_FRAGMENT_SHADER, m_strScript)); + } catch (eTJSError &e) { + e.AppendMessage("\n"); + e.AppendMessage(Name); + throw; + } cocos2d::GL::useProgram(program); std::string tex("tex"); std::string coord("a_texCoord"); @@ -1626,7 +1685,7 @@ class tTVPOGLRenderMethod_Script_BlendColor : public tTVPOGLRenderMethod_Script } return inherit::EnumParameterID(name); } - virtual void SetParameterOpa(int id, int Value) { + virtual void SetParameterOpa(int id, int Value) override { if (id == 0x709AC167) { float v = Value / 255.f; cocos2d::GL::useProgram(program); @@ -1635,6 +1694,15 @@ class tTVPOGLRenderMethod_Script_BlendColor : public tTVPOGLRenderMethod_Script inherit::SetParameterOpa(id, Value); } }; + virtual void SetParameterFloat(int id, float Value) override { + if (id == 0x709AC167) { + float v = Value; + cocos2d::GL::useProgram(program); + glBlendColor(v, v, v, v); + } else { + inherit::SetParameterFloat(id, Value); + } + }; }; class tTVPOGLRenderMethod_AdjustGamma : public tTVPOGLRenderMethod_Script { @@ -1813,6 +1881,10 @@ const void * tTVPOGLTexture2D::GetScanLineForRead(tjs_uint l) if (_scaleW == 1.f && _scaleH == 1.f) { if (!PixelData) { PixelData = new unsigned char[internalW * internalH * 4]; +#ifdef _MSC_VER + GL::glGetTextureImage(texture, 0, GL_RGBA, GL_UNSIGNED_BYTE, internalH * internalW * 4, PixelData); + return &PixelData[l * internalW * 4]; +#endif TVPSetRenderTarget(texture); glViewport(0, 0, internalW, internalH); glPixelStorei(GL_PACK_ALIGNMENT, 4); // always dword aligned @@ -1870,6 +1942,8 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { "#define gl_LastFragColor 
gl_LastFragData[0]" "\n" "#elif defined(GL_NV_shader_framebuffer_fetch)" "\n" "#extension GL_NV_shader_framebuffer_fetch : require" "\n" + "#else" "\n" + "#error noy any framebuffer fetch extension available" "\n" "#endif" "\n" ; @@ -1982,7 +2056,7 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { } } } - if (isOpaque) { + if (fmt == TVPTextureFormat::RGBA && isOpaque) { int dpitch = (w * 3 + 7) & ~7; tjs_uint8 *rgb = new tjs_uint8[dpitch * h + 16]; tjs_uint8 *dst = (tjs_uint8*)(((intptr_t)rgb + 7) & ~7); @@ -2095,7 +2169,7 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { // test //_CreateStaticTexture2D = CreateStaticTexture2D_half; //_CreateMutableTexture2D = CreateMutableTexture2D_half; - std::string compTexMethod = IndividualConfigManager::GetInstance()->GetValueString("ogl_compress_tex", "none"); + std::string compTexMethod = IndividualConfigManager::GetInstance()->GetValue("ogl_compress_tex", "none"); if (compTexMethod == "half") { _CreateStaticTexture2D = CreateStaticTexture2D_half; // _CreateMutableTexture2D = CreateMutableTexture2D_half; @@ -2103,7 +2177,7 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { GLint maxTextSize; glGetIntegerv(GL_MAX_TEXTURE_SIZE, &maxTextSize); TVPMaxTextureSize = maxTextSize; - maxTextSize = IndividualConfigManager::GetInstance()->GetValueInt("ogl_max_texsize", 0); + maxTextSize = IndividualConfigManager::GetInstance()->GetValue("ogl_max_texsize", 0); if (maxTextSize > 0 && (maxTextSize < TVPMaxTextureSize || TVPMaxTextureSize < 1024)) { TVPMaxTextureSize = maxTextSize; // override by user config } @@ -2475,17 +2549,29 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE); TEST_SHADER_BLTO(AlphaBlend); + CompileAndRegScript("AlphaBlend_color", colorPrefix + ScriptCommonPrefix + + " s *= color;\n" + " gl_FragColor = s;\n" + "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_SRC_ALPHA, 
GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE); + if (GL::glAlphaFunc) { - CompileAndRegScript("AlphaBlend_AlphaTest", opacityPrefix + ScriptCommonPrefix + - " s.a *= opacity;\n" + CompileAndRegScript("AlphaBlend_color_AlphaTest", colorPrefix + ScriptCommonPrefix + + " s *= color;\n" " gl_FragColor = s;\n" "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE); + CompileAndRegScript("AlphaTest", ScriptCommonPrefix + + " gl_FragColor = s;\n" + "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ZERO, GL_ONE, GL_ZERO, GL_ONE); } else { - CompileAndRegScript("AlphaBlend_AlphaTest", opacityPrefix + alpha_thresholdPrefix + ScriptCommonPrefix + - " s.a *= opacity;\n" + CompileAndRegScript("AlphaBlend_color_AlphaTest", colorPrefix + alpha_thresholdPrefix + ScriptCommonPrefix + + " s *= color;\n" " if(s.a < alpha_threshold) discard;\n" " else gl_FragColor = s;\n" "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE); + CompileAndRegScript("AlphaTest", alpha_thresholdPrefix + ScriptCommonPrefix + + " if(s.a < alpha_threshold) discard;\n" + " gl_FragColor = s;\n" + "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ZERO, GL_ONE, GL_ZERO, GL_ONE); } CompileAndRegScript("AlphaBlend_SD", /*opacityPrefix +*/ ScriptCommonPrefix + @@ -2515,14 +2601,19 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { TEST_SHADER(AlphaBlend_a, TVPAlphaBlend_ao(testdest, testdata1, 256 * 256, TEST_SHADER_OPA)); + CompileAndRegScript("AlphaBlend_color_a", colorPrefix + ScriptCommonPrefix + + " s *= color;\n" + " gl_FragColor = s;\n" + "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + if (GL::glAlphaFunc) { - CompileAndRegScript("AlphaBlend_a_AlphaTest", opacityPrefix + ScriptCommonPrefix + - " s.a *= opacity;\n" + CompileAndRegScript("AlphaBlend_color_a_AlphaTest", colorPrefix + ScriptCommonPrefix + + " s *= color;\n" " gl_FragColor = s;\n" "}", 
1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA); } else { - CompileAndRegScript("AlphaBlend_a_AlphaTest", opacityPrefix + alpha_thresholdPrefix + ScriptCommonPrefix + - " s.a *= opacity;\n" + CompileAndRegScript("AlphaBlend_color_a_AlphaTest", colorPrefix + alpha_thresholdPrefix + ScriptCommonPrefix + + " s *= color;\n" " if(s.a < alpha_threshold) discard;\n" " gl_FragColor = s;\n" "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA); @@ -2538,11 +2629,18 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { TEST_SHADER(AlphaBlend_d, TVPAlphaBlend_do(testdest, testdata1, 256 * 256, TEST_SHADER_OPA)); + shader_AlphaBlend_d = + " s *= color;\n" + " d.a = s.a + d.a - s.a * d.a;\n" + " d.rgb = mix(d.rgb, s.rgb, s.a / (d.a + 0.0001));\n" + " gl_FragColor = d;\n" + "}"; + CompileAndRegRegularBlendMethod("AlphaBlend_color_d", colorPrefix, shader_AlphaBlend_d); if (GL::glAlphaFunc) { - CompileAndRegRegularBlendMethod("AlphaBlend_d_AlphaTest", opacityPrefix, shader_AlphaBlend_d); + CompileAndRegRegularBlendMethod("AlphaBlend_color_d_AlphaTest", colorPrefix, shader_AlphaBlend_d); } else { - CompileAndRegRegularBlendMethod("AlphaBlend_d_AlphaTest", opacityPrefix + alpha_thresholdPrefix, - " s.a *= opacity;\n" + CompileAndRegRegularBlendMethod("AlphaBlend_color_d_AlphaTest", colorPrefix + alpha_thresholdPrefix, + " s *= color;\n" " if(s.a < alpha_threshold) discard;\n" " d.a = s.a + d.a - s.a * d.a;\n" " d.rgb = mix(d.rgb, s.rgb, s.a / (d.a + 0.0001));\n" @@ -2625,17 +2723,21 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { CompileAndRegRegularBlendMethod("PsAddBlend", opacityPrefix, shader_PsAddBlend); TEST_SHADER_BLTO(PsAddBlend); + shader_PsAddBlend = + " s *= color;\n" + " d.rgb = mix(d.rgb, clamp(s.rgb + d.rgb, 0.0, 1.0), s.a);\n" + " gl_FragColor = d;\n" + "}"; + CompileAndRegRegularBlendMethod("PsAddBlend_color", colorPrefix, 
shader_PsAddBlend); if (GL::glAlphaFunc) { - CompileAndRegRegularBlendMethod("PsAddBlend_AlphaTest", opacityPrefix, + CompileAndRegRegularBlendMethod("PsAddBlend_color_AlphaTest", colorPrefix, shader_PsAddBlend); } else { - CompileAndRegRegularBlendMethod("PsAddBlend_AlphaTest", opacityPrefix + alpha_thresholdPrefix, - " s.a *= opacity;\n" + CompileAndRegRegularBlendMethod("PsAddBlend_color_AlphaTest", colorPrefix + alpha_thresholdPrefix, + " s *= color;\n" " if(s.a < alpha_threshold) discard;\n" - " else {\n" - " d.rgb = mix(d.rgb, clamp(s.rgb + d.rgb, 0.0, 1.0), s.a);\n" - " gl_FragColor = d;\n" - " }\n" + " d.rgb = mix(d.rgb, clamp(s.rgb + d.rgb, 0.0, 1.0), s.a);\n" + " gl_FragColor = d;\n" "}"); } @@ -2646,16 +2748,21 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { CompileAndRegRegularBlendMethod("PsSubBlend", opacityPrefix, shader_PsSubBlend); TEST_SHADER_BLTO(PsSubBlend); + shader_PsSubBlend = + " s *= color;\n" + " d.rgb = mix(d.rgb, clamp(d.rgb + s.rgb - 1.0, 0.0, 1.0), s.a);\n" + " gl_FragColor = d;\n" + "}"; + CompileAndRegRegularBlendMethod("PsSubBlend_color", colorPrefix, shader_PsSubBlend); if (GL::glAlphaFunc) { - CompileAndRegRegularBlendMethod("PsSubBlend_AlphaTest", opacityPrefix, + CompileAndRegRegularBlendMethod("PsSubBlend_color_AlphaTest", colorPrefix, shader_PsSubBlend); } else { - CompileAndRegRegularBlendMethod("PsSubBlend_AlphaTest", opacityPrefix + alpha_thresholdPrefix, - " s.a *= opacity;\n" + CompileAndRegRegularBlendMethod("PsSubBlend_color_AlphaTest", colorPrefix + alpha_thresholdPrefix, + " s *= color;\n" " if(s.a < alpha_threshold) discard;\n" - " else {\n" " d.rgb = mix(d.rgb, clamp(d.rgb + s.rgb - 1.0, 0.0, 1.0), s.a);\n" - " gl_FragColor = d;\n }\n" + " gl_FragColor = d;\n" "}"); } @@ -2669,17 +2776,24 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { shader_PsMulBlend, 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ZERO, GL_SRC_COLOR, GL_ZERO, GL_ONE); TEST_SHADER_BLTO(PsMulBlend); + shader_PsMulBlend = + " s 
*= color;\n" + " s.rgb *= s.a;\n" + " s.rgb += 1.0 - s.a;\n" + " gl_FragColor = s;\n" + "}"; + CompileAndRegScript("PsMulBlend_color", colorPrefix + ScriptCommonPrefix + + shader_PsMulBlend, 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ZERO, GL_SRC_COLOR, GL_ZERO, GL_ONE); if (GL::glAlphaFunc) { - CompileAndRegScript("PsMulBlend_AlphaTest", opacityPrefix + ScriptCommonPrefix + + CompileAndRegScript("PsMulBlend_color_AlphaTest", colorPrefix + ScriptCommonPrefix + shader_PsMulBlend, 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ZERO, GL_SRC_COLOR, GL_ZERO, GL_ONE); } else { - CompileAndRegScript("PsMulBlend_AlphaTest", opacityPrefix + alpha_thresholdPrefix + ScriptCommonPrefix + - " s.a *= opacity;\n" + CompileAndRegScript("PsMulBlend_color_AlphaTest", colorPrefix + alpha_thresholdPrefix + ScriptCommonPrefix + + " s *= color;\n" " if(s.a < alpha_threshold) discard;\n" - " else {\n" " s.rgb *= s.a;\n" " s.rgb += 1.0 - s.a;\n" - " gl_FragColor = s;\n }\n" + " gl_FragColor = s;\n" "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ZERO, GL_SRC_COLOR, GL_ZERO, GL_ONE); } @@ -2771,16 +2885,22 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { shader_PsScreenBlend, 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ONE_MINUS_DST_COLOR, GL_ONE, GL_ZERO, GL_ONE); TEST_SHADER_BLTO(PsScreenBlend); + shader_PsScreenBlend = + " s *= color;\n" + " s.rgb *= s.a;\n" + " gl_FragColor = s;\n" + "}"; + CompileAndRegScript("PsScreenBlend_color", colorPrefix + ScriptCommonPrefix + + shader_PsScreenBlend, 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ONE_MINUS_DST_COLOR, GL_ONE, GL_ZERO, GL_ONE); if (GL::glAlphaFunc) { - CompileAndRegScript("PsScreenBlend_AlphaTest", opacityPrefix + ScriptCommonPrefix + + CompileAndRegScript("PsScreenBlend_color_AlphaTest", colorPrefix + ScriptCommonPrefix + shader_PsScreenBlend, 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ONE_MINUS_DST_COLOR, GL_ONE, GL_ZERO, GL_ONE); } else { - CompileAndRegScript("PsScreenBlend_AlphaTest", opacityPrefix + alpha_thresholdPrefix + 
ScriptCommonPrefix + - " s.a *= opacity;\n" + CompileAndRegScript("PsScreenBlend_color_AlphaTest", colorPrefix + alpha_thresholdPrefix + ScriptCommonPrefix + + " s *= color;\n" " if(s.a < alpha_threshold) discard;\n" - " else {\n" " s.rgb *= s.a;\n" - " gl_FragColor = s;\n }\n" + " gl_FragColor = s;\n" "}", 1)->SetBlendFuncSeparate(GL_FUNC_ADD, GL_ONE_MINUS_DST_COLOR, GL_ONE, GL_ZERO, GL_ONE); } // AdditiveAlpha <-> Alpha @@ -2850,6 +2970,54 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { " gl_FragColor = s / vec4(9, 9, 9, 9);\n" "}", 1)/*->SetTargetAsSrc()*/; //TEST_SHADER(BoxBlur, TVPDoGrayScale(testdest, 256 * 256)); let it pass +#if 0 + // GL_EXT_shader_framebuffer_fetch issue in some adreno GPUs + if (TVPCheckGLExtension("GL_EXT_shader_framebuffer_fetch")) { + tTVPBitmap *testbmp1 = new tTVPBitmap(256, 256, 32); + tTVPBitmap *testbmp2 = new tTVPBitmap(256, 256, 32); + for (int y = 0; y < 256; ++y) { + uint8_t *pix1 = (uint8_t *)testbmp1->GetScanLine(y); + uint8_t *pix2 = (uint8_t *)testbmp2->GetScanLine(y); + for (int x = 0; x < 256; ++x) { + pix2[2] = pix1[0] = 255 - x; + pix2[3] = pix1[1] = x; + pix2[1] = pix1[2] = y; + pix2[0] = pix1[3] = 255 - y; + pix1 += 4; + pix2 += 4; + } + } + tTVPRect rc(0, 0, 256, 256); + iTVPTexture2D *testtex1 = CreateTexture2D(nullptr, 0, 256, 256, TVPTextureFormat::RGBA, 0); + testtex1->Update(testbmp1->GetScanLine(0), TVPTextureFormat::RGBA, testbmp1->GetPitch(), rc); + iTVPTexture2D *testtex2 = CreateTexture2D(testbmp2); + // test GL_EXT_shader_framebuffer_fetch + TVPPsAddBlend_HDA((tjs_uint32*)testbmp1->GetScanLine(0), (tjs_uint32*)testbmp2->GetScanLine(0), + testbmp1->GetPitch() * 256 / 4); + + tTVPOGLRenderMethod_Script *method = (tTVPOGLRenderMethod_Script *)GetRenderMethod("PsAddBlend"); + method->SetParameterOpa(method->EnumParameterID("opacity"), 255); + std::vector src_tex; + src_tex.emplace_back(testtex2, rc); + OperateRect(method, testtex1, testtex1, rc, tRenderTexRectArray(&src_tex[0], 
src_tex.size())); + + uint8_t *pix1 = (uint8_t *)testbmp1->GetScanLine(0); + uint8_t *pix2 = (uint8_t *)testtex1->GetScanLineForRead(0); + for (int i = 0; i < 256 * 256 * 4; ++i) { + if (std::abs(pix1[i] - pix2[i]) > 2) { + const char *btnText = "OK"; + TVPShowSimpleMessageBox(LocaleConfigManager::GetInstance()->GetText + ("issue_GL_EXT_shader_framebuffer_fetch").c_str(), "Info", 1, &btnText); + break; + } + } + + delete testbmp1; + delete testbmp2; + delete testtex1; + delete testtex2; + } +#endif } tTVPOGLTexture2D *tempTexture; @@ -2883,7 +3051,7 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { } void CopyTexture(tTVPOGLTexture2D *dst, tTVPOGLTexture2D *src, const tTVPRect &rcsrc) { - while (GL::glCopyImageSubData && + if (GL::glCopyImageSubData && src->_scaleW == dst->_scaleW && src->_scaleH == dst->_scaleH && src->Format == dst->Format) { tTVPRect rc; @@ -3211,7 +3379,11 @@ class TVPRenderManager_OpenGL : public iTVPRenderManager { GL::glGetTextureImage(texlist[i].tex->texture, 0, GL_BGRA, GL_UNSIGNED_BYTE, texlist[i].tex->internalH * texlist[i].tex->internalW * 4, _src[i]->ptr(0, 0)); } cv::Mat _tar(tar->internalH, tar->internalW, CV_8UC4); + cv::Mat _stencil(tar->internalH, tar->internalW, CV_8U); GL::glGetTextureImage(tar->texture, 0, GL_BGRA, GL_UNSIGNED_BYTE, tar->internalH * tar->internalW * 4, _tar.ptr(0, 0)); + if (glIsEnabled(GL_STENCIL_TEST)) { + glReadPixels(0, 0, tar->internalW, tar->internalH, GL_STENCIL_INDEX, GL_UNSIGNED_BYTE, _stencil.ptr(0, 0)); + } tar = tar; for (unsigned int i = 0; i < texlist.size(); ++i) { delete _src[i]; diff --git a/src/plugins/dirlist.cpp b/src/plugins/dirlist.cpp index 6bd9baf8..a1b3c195 100644 --- a/src/plugins/dirlist.cpp +++ b/src/plugins/dirlist.cpp @@ -36,20 +36,14 @@ class tGetDirListFunction : public tTJSDispatch iTJSDispatch2 * array = TJSCreateArrayObject(); if (!result) return TJS_S_OK; try { - class tLister : public iTVPStorageLister - { - public: - tTJSArrayNI* ni; - tLister(iTJSDispatch2 * 
arr) { - arr->NativeInstanceSupport(TJS_NIS_GETINSTANCE, TJSGetArrayClassID(), (iTJSNativeInstance**)&ni); - } - void TJS_INTF_METHOD Add(const ttstr &file) - { - ni->Items.emplace_back(file); - } - } lister(array); + tTJSArrayNI* ni; + array->NativeInstanceSupport(TJS_NIS_GETINSTANCE, TJSGetArrayClassID(), (iTJSNativeInstance**)&ni); TVPGetLocalName(dir); - TVPGetLocalFileListAt(dir, &lister, S_IFMT); + TVPGetLocalFileListAt(dir, [ni](const ttstr &name, tTVPLocalFileInfo* s) { + if (s->Mode & (S_IFREG | S_IFDIR)) { + ni->Items.emplace_back(name); + } + }); *result = tTJSVariant(array, array); array->Release(); }