Skip to content

Commit

Permalink
WebParser: Added "DecodeCodePoints" option (default is 0)
Browse files Browse the repository at this point in the history
This will decode any hardcoded "\uXXXX" or "\UXXXXXXXX" Unicode code point escapes returned from WebParser (similar to what DecodeCharacterReference does for character references).
Note: DecodeCodePoints=1 should always mean "decode ALL code points", in case other types of escaped code point decoding (like octal) are added later.
  • Loading branch information
brianferguson committed Jul 31, 2020
1 parent d03f16c commit fc26a30
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 4 deletions.
29 changes: 28 additions & 1 deletion Common/CharacterEntityReference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,13 +303,15 @@ WCHAR GetEntityChar(const std::wstring& entity)

} // namespace

void Decode(std::wstring& str, int opt)
void Decode(std::wstring& str, int opt, bool unescape)
{
// (opt <= 0 || opt > 3) : Do nothing.
// (opt == 1) : Decode both numeric character references and character entity references.
// (opt == 2) : Decode only numeric character references.
// (opt == 3) : Decode only character entity references.

// (unescape == true) : Unescape any \uXXXX or \UXXXXXXXX unicode code points.

if (opt >= 1 && opt <= 3)
{
std::wstring::size_type start = 0;
Expand Down Expand Up @@ -387,6 +389,31 @@ void Decode(std::wstring& str, int opt)
}
}
}

if (unescape)
{
	// Replace every well-formed "\uXXXX" (exactly 4 hex digits) or "\UXXXXXXXX"
	// (exactly 8 hex digits) escape in |str| with the single character it names.
	// Any other text following a backslash is left untouched.
	for (std::wstring::size_type pos = str.find(L'\\');
		pos != std::wstring::npos;
		pos = str.find(L'\\', pos + 1))  // Resume after the backslash / decoded character.
	{
		// A trailing backslash has no escape letter after it, and no further
		// backslash can follow, so the scan is finished.
		if (pos + 1 >= str.size())
		{
			break;
		}

		std::wstring::size_type len = 0;
		switch (str[pos + 1])
		{
		case L'u': len = 4; break;
		case L'U': len = 8; break;
		default: continue;  // Not a code point escape.
		}

		// The escape letter must be followed by all |len| digits. A sequence cut
		// short by the end of the string is not an escape and is left as-is.
		const std::wstring::size_type digits = pos + 2;
		if (str.size() - digits < len)
		{
			continue;
		}

		// Code points use hexadecimal format. The digits are parsed by hand rather
		// than with wcstol, because wcstol also skips leading whitespace and accepts
		// a sign or a "0x" prefix, which would turn malformed text such as "\u+041"
		// or "\u0x41" into a decoded character.
		unsigned long ch = 0;
		bool valid = true;
		for (std::wstring::size_type i = 0; i < len; ++i)
		{
			const WCHAR c = str[digits + i];
			unsigned long digit = 0;
			if (c >= L'0' && c <= L'9')
			{
				digit = static_cast<unsigned long>(c - L'0');
			}
			else if (c >= L'a' && c <= L'f')
			{
				digit = static_cast<unsigned long>(c - L'a') + 10;
			}
			else if (c >= L'A' && c <= L'F')
			{
				digit = static_cast<unsigned long>(c - L'A') + 10;
			}
			else
			{
				valid = false;
				break;
			}
			ch = (ch << 4) | digit;
		}

		// Same accepted range as before: 0x0001 through 0xFFFD. Zero and anything
		// from 0xFFFE upwards is treated as an invalid character and the escape
		// text is kept unchanged.
		if (!valid || ch == 0 || ch >= 0xFFFE)
		{
			continue;
		}

		// Swap the backslash, the escape letter and the digits for the one character.
		str.replace(pos, len + 2, 1, static_cast<WCHAR>(ch));
	}
}
}

} // namespace CharacterEntityReference
2 changes: 1 addition & 1 deletion Common/CharacterEntityReference.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace CharacterEntityReference {

void Decode(std::wstring& str, int opt);
void Decode(std::wstring& str, int opt, bool unescape);

} // namespace CharacterEntityReference

Expand Down
7 changes: 5 additions & 2 deletions Library/MeasureWebParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ MeasureWebParser::MeasureWebParser(Skin* skin, const WCHAR* name) : Measure(skin
m_StringIndex(),
m_StringIndex2(),
m_DecodeCharacterReference(),
m_DecodeCodePoints(false),
m_Debug(),
m_LogSubstringErrors(),
m_UpdateRate(),
Expand Down Expand Up @@ -372,6 +373,8 @@ void MeasureWebParser::ReadOptions(ConfigParser& parser, const WCHAR* section)
m_StringIndex2 = index < 0 ? 0 : index;

m_DecodeCharacterReference = parser.ReadInt(section, L"DecodeCharacterReference", 0);
m_DecodeCodePoints = parser.ReadBool(section, L"DecodeCodePoints", false);

m_UpdateRate = parser.ReadInt(section, L"UpdateRate", 600);
m_Codepage = parser.ReadInt(section, L"CodePage", 0);
if (m_Codepage == 0)
Expand Down Expand Up @@ -668,7 +671,7 @@ void MeasureWebParser::ParseData(const BYTE* rawData, DWORD rawSize, bool utf16D
int matchLen = ovector[2 * m_StringIndex + 1] - ovector[2 * m_StringIndex];
EnterCriticalSection(&g_CriticalSection);
m_ResultString.assign(match, matchLen);
CharacterEntityReference::Decode(m_ResultString, m_DecodeCharacterReference);
CharacterEntityReference::Decode(m_ResultString, m_DecodeCharacterReference, m_DecodeCodePoints);
LeaveCriticalSection(&g_CriticalSection);
}
else
Expand Down Expand Up @@ -725,7 +728,7 @@ void MeasureWebParser::ParseData(const BYTE* rawData, DWORD rawSize, bool utf16D
(*i)->m_ResultString.replace(
StringUtil::CaseInsensitiveFind((*i)->m_ResultString, compareStr),
compareStr.size(), match, matchLen);
CharacterEntityReference::Decode((*i)->m_ResultString, (*i)->m_DecodeCharacterReference);
CharacterEntityReference::Decode((*i)->m_ResultString, (*i)->m_DecodeCharacterReference, (*i)->m_DecodeCodePoints);

// Start download threads for the references
if ((*i)->m_Download)
Expand Down
1 change: 1 addition & 0 deletions Library/MeasureWebParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class MeasureWebParser : public Measure
int m_StringIndex;
int m_StringIndex2;
int m_DecodeCharacterReference;
bool m_DecodeCodePoints;
int m_Debug;
UINT m_UpdateRate;
UINT m_UpdateCounter;
Expand Down

0 comments on commit fc26a30

Please sign in to comment.