From e512a66bb07ddc1e62b1feaa76f36e340bec9f5d Mon Sep 17 00:00:00 2001 From: mdecimus Date: Fri, 30 Aug 2024 17:04:00 +0200 Subject: [PATCH] More flexible charset name parsing (closes #85) --- src/decoders/charsets/map.rs | 1091 +++++++++++----------------------- 1 file changed, 332 insertions(+), 759 deletions(-) diff --git a/src/decoders/charsets/map.rs b/src/decoders/charsets/map.rs index 9b8e274..d24b0ba 100644 --- a/src/decoders/charsets/map.rs +++ b/src/decoders/charsets/map.rs @@ -12,33 +12,34 @@ use super::{ multi_byte::*, single_byte::*, - utf::{decoder_utf16, decoder_utf16_be, decoder_utf16_le, decoder_utf7, decoder_utf8}, + utf::{decoder_utf16, decoder_utf16_be, decoder_utf16_le, decoder_utf7}, DecoderFnc, }; pub fn charset_decoder(charset: &[u8]) -> Option { if (2..=45).contains(&charset.len()) { let mut l_charset = [0u8; 45]; - let mut hash: u32 = charset.len() as u32; + let mut hash = charset.len(); + let mut ch = 0; - for (pos, ch) in charset.iter().enumerate() { - let ch = if ch.is_ascii_uppercase() { - *ch + 32 - } else { - *ch + for (pos, ch_) in charset.iter().enumerate() { + ch = match ch_ { + b'A'..=b'Z' => *ch_ + 32, + b'-' => b'_', + _ => *ch_, }; - l_charset[pos] = ch; - if let 0 | 3 | 6 | 7 | 8 | 9 = pos { - hash += { - #[cfg(feature = "ludicrous_mode")] - unsafe { - *CH_HASH.get_unchecked(ch as usize) - } - #[cfg(not(feature = "ludicrous_mode"))] - CH_HASH[ch as usize] - }; + + #[cfg(feature = "ludicrous_mode")] + unsafe { + *l_charset.get_unchecked_mut(pos) = ch; } - if pos == charset.len() - 1 { + + #[cfg(not(feature = "ludicrous_mode"))] + { + l_charset[pos] = ch; + } + + if let 0 | 2 | 6 | 7 | 8 | 9 = pos { hash += { #[cfg(feature = "ludicrous_mode")] unsafe { @@ -46,12 +47,20 @@ pub fn charset_decoder(charset: &[u8]) -> Option { } #[cfg(not(feature = "ludicrous_mode"))] CH_HASH[ch as usize] - }; + } as usize; } } - if (7..=764).contains(&hash) { - let hash = (hash - 7) as usize; + hash += { + #[cfg(feature = "ludicrous_mode")] + unsafe { + *CH_HASH.get_unchecked(ch as usize) + } + #[cfg(not(feature = "ludicrous_mode"))] + CH_HASH[ch as usize] + } as usize; + + if hash <= 544 { let ch_map = { #[cfg(feature = "ludicrous_mode")] unsafe { @@ -78,415 +87,264 @@ pub fn charset_decoder(charset: &[u8]) -> Option { } pub fn no_op(_bytes: &[u8]) -> String { - "".to_string() + String::new() } // Perfect hashing table for charset names -static CH_HASH: &[u32] = &[ - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 0, 55, 765, 125, 5, 90, 155, 35, 15, 45, 140, 0, 30, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 15, 765, - 5, 0, 225, 15, 35, 0, 135, 115, 0, 5, 15, 5, 0, 20, 0, 30, 765, 0, 5, 10, 10, 765, 5, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, 765, - 765, +static CH_HASH: &[u16] = &[ + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 50, 0, 80, 55, 15, 25, 40, 100, 5, 35, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 0, 545, + 135, 10, 0, 5, 15, 35, 10, 65, 30, 0, 5, 0, 230, 0, 0, 70, 545, 55, 5, 0, 175, 545, 55, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, 545, + 545, 545, ]; -static CH_MAP: &[&[u8]; 758] = &[ - b"l8", +static CH_MAP: &[&[u8]; 545] = &[ b"", b"", - b"", - b"latin8", b"l1", b"", b"", - b"", //b"utf-8", - b"latin1", - b"", //b"us", - b"", - b"", - b"", - b"koi8-r", - b"l5", - b"", //b"us-ascii", - b"", - b"", - b"latin5", b"", - b"ms_kanji", - b"shift_jis", + b"latin1", + b"l8", b"", - b"koi8-u", b"", b"", - b"big5", + b"latin8", b"", - b"ibm819", b"", b"", b"", b"", - b"euc-kr", b"l4", b"", b"", - b"", + b"csgbk", b"latin4", b"", - b"866", - b"", - b"iso-ir-148", - b"ibm866", - b"l6", - b"ecma-118", - b"", - b"iso-8859-8", - b"latin6", - b"", - b"", - b"", - b"", - b"utf-16", - b"", - b"", - b"", - b"iso-8859-1", - b"iso-8859-11", - b"", - b"", - b"", - b"iso_8859-8", - b"euc-jp", - b"latin-9", - b"", - b"", //b"iso646-us", - b"iso_8859-8:1988", - b"iso-8859-15", - b"", - b"", - b"", - b"iso_8859-1", - b"iso_8859-14:1998", - b"", - b"", - b"", - b"iso-8859-5", - b"iso_8859-16:2001", - b"", - b"utf-16be", - b"", - b"iso_8859-5:1988", - b"iso_8859-15", - b"", - b"utf-16le", - b"", - b"", - b"iso-8859-14", - b"l2", - b"", //b"iso-ir-6", - b"", - b"iso_8859-5", - b"latin2", - b"", - b"", - b"", - b"iso-ir-199", - b"iso-8859-16", - b"", - b"", - b"", - b"iso_8859-4:1988", - b"iso_8859-14", - b"", - b"", - b"", - b"iso-8859-9", - b"", - b"", + b"gbk", b"", b"", - b"iso-ir-144", - b"iso_8859-16", b"", - b"ecma-114", + b"l5", b"", - b"iso-8859-4", - b"hebrew", b"", - b"850", b"", - b"iso_8859-9", - b"ibm850", - b"windows-1258", - b"l10", + b"latin5", b"", - b"iso_8859-9:1989", - b"", //b"iso_646.irv:1991", - b"windows-1251", - b"asmo-708", b"", - b"iso_8859-4", b"", + b"greek", + b"greek8", b"", - b"elot_928", b"", - b"iso-8859-6", b"", - b"windows-1255", b"", + b"csbig5", + b"l6", b"", - b"iso-ir-101", b"", + b"cp819", + b"latin6", b"", - b"gbk", b"", - b"utf-7", + b"big5", + b"cp866", b"", b"", b"", b"", - b"iso_8859-6", + b"csshiftjis", b"", b"l3", b"", b"", - b"", + b"cp850", b"latin3", - b"windows-1254", - b"", b"", - b"iso-ir-138", - b"iso_8859-10:1992", b"", b"", - b"", - b"", - b"greek8", - b"windows-1256", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"iso-8859-10", - b"", - b"", - b"", - b"greek", - b"", + b"iso_ir_148", b"", b"", b"", - b"iso-ir-126", b"", + b"iso_celtic", b"", b"", b"", - b"iso-ir-109", b"", b"", + b"euc_kr", + b"latin_9", b"", b"", - b"ms936", - b"", + b"cp936", b"", + b"l2", b"", + b"shift_jis", + b"iso_ir_144", + b"latin2", b"", + b"866", b"", - b"windows-874", + b"iso_ir_101", + b"euc_jp", b"", + b"cyrillic", b"", b"", + b"koi8_r", b"", - b"iso-8859-13", b"", b"", + b"iso_8859_1", + b"iso_8859_11", b"", - b"iso_8859-1:1987", + b"l10", b"", - b"windows-1252", + b"iso_ir_138", + b"iso_8859_16:2001", + b"latin10", + b"850", b"", + b"iso_8859_8", + b"iso_8859_14:1998", + b"tis_620", + b"elot_928", b"", - b"extended_unix_code_packed_format_for_japanese", - b"iso-2022-jp", + b"iso_8859_8:1988", + b"iso_8859_14", + b"gb18030", b"", - b"mac", b"", - b"iso_8859-3:1988", b"", + b"csiso885914", b"cskoi8r", b"", b"", - b"iso-8859-2", - b"arabic", - b"", //b"csascii", - b"", + b"iso_8859_4:1988", + b"iso_8859_15", b"", + b"cswindows1251", b"", - b"", //b"csutf8", + b"iso_8859_4", + b"csiso885915", b"cseuckr", - b"", - b"macintosh", - b"csgbk", - b"csbig5", - b"", - b"", - b"", - b"iso_8859-2", - b"", - b"cskoi8u", b"cswindows1258", b"", - b"", - b"", - b"windows-1250", - b"cswindows1251", - b"", - b"iso_8859-6:1987", - b"", - b"latin10", - b"", - b"", //b"ansi_x3.4-1968", - b"cp819", - b"windows-936", - b"tis-620", - b"cswindows1255", - b"", - b"iso-ir-110", - b"", - b"windows-1257", - b"", - b"", - b"", + b"iso_8859_5:1988", + b"hebrew", b"", b"", b"", + b"iso_ir_110", + b"iso_8859_16", b"", - b"iso-ir-226", - b"csisolatin1", + b"cswindows1254", + b"ks_c_5601_1989", + b"iso_ir_199", + b"csiso885916", b"cswindows874", b"", b"", - b"", - b"csisolatinhebrew", - b"windows-1253", - b"cswindows1254", + b"iso_8859_5", + b"iso_8859_10", + b"windows_1251", + b"cswindows1255", b"", b"", - b"csisolatin5", + b"iso_8859_13", + b"windows_1258", + b"csibm866", b"", + b"iso_ir_109", + b"csiso885913", b"", b"", - b"csisolatingreek", b"", + b"iso_8859_3:1988", b"", + b"windows_1254", b"cswindows1256", b"", - b"", - b"", //b"ibm367", - b"", - b"", - b"", - b"iso_8859-2:1987", - b"csiso885915", - b"", - b"", - b"", //b"ansi_x3.4-1986", - b"iso-ir-157", - b"csisolatin4", - b"", + b"iso_8859_9", b"", b"", b"", b"", + b"iso_8859_9:1989", + b"gb2312", + b"windows_1255", + b"cswindows1250", b"", - b"", - b"cseucpkdfmtjapanese", - b"cp866", - b"csisolatin6", - b"", - b"", + b"iso_8859_6", b"", b"", - b"csiso885914", + b"cswindows1253", b"", b"", + b"iso_8859_10:1992", b"", + b"cstis620", b"", + b"iso_ir_100", + b"windows_874", + b"windows_1256", b"", - b"csutf16", b"", b"", - b"iso-8859-7", - b"csiso885916", b"", b"", b"", b"", + b"iso_ir_126", + b"windows_936", + b"windows_1250", b"", b"", + b"iso_8859_1:1987", + b"csisolatin1", + b"windows_1253", b"cswindows1252", + b"ks_c_5601_1987", + b"iso_8859_3", + b"csisolatinarabic", b"", + b"csisolatincyrillic", b"", + b"csisolatingreek", + b"koi8_u", b"", b"", b"", b"", - b"iso_8859-7", - b"", - b"", - b"", - b"", - b"iso_8859-7:1987", - b"", - b"", - b"", - b"csutf16be", - b"", - b"", - b"", - b"", - b"csutf16le", - b"iso-8859-3", - b"csisolatin2", - b"", - b"csibm866", - b"", - b"cp850", - b"", - b"", - b"", - b"", - b"csshiftjis", + b"csisolatin4", b"", b"", - b"cswindows1250", + b"csgb18030", b"", - b"iso_8859-3", - b"csutf7", b"", b"", + b"cswindows1257", b"", - b"iso-ir-127", - b"gb2312", b"", + b"csisolatin5", + b"windows_1252", + b"mac", b"", b"", - b"iso-ir-100", - b"csmacintosh", - b"gb18030", - b"cswindows1257", b"", b"", b"", @@ -496,63 +354,91 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", + b"iso_8859_6:1987", + b"csisolatin6", b"", + b"utf_16le", b"", b"", b"", + b"windows_1257", b"", - b"cswindows1253", + b"csutf16le", b"", + b"utf_16", b"", + b"utf_16be", b"", + b"iso_8859_2", + b"csisolatin3", + b"csutf16", + b"ecma_118", + b"csutf16be", + b"iso_ir_157", + b"csisolatinhebrew", b"", b"", b"", - b"", //b"cp367", + b"extended_unix_code_packed_format_for_japanese", + b"iso_2022_jp", b"", b"", b"", b"", + b"arabic", b"", b"", b"", + b"iso_ir_226", + b"csutf7", b"", + b"ecma_114", b"", + b"iso_8859_2:1987", + b"csisolatin2", b"", b"", b"", b"", b"", - b"cp936", - b"csisolatin3", b"", b"", b"", b"", b"", b"", + b"ms_kanji", b"", + b"iso_8859_7", + b"ibm819", b"", b"", b"", + b"iso_8859_7:1987", + b"ibm866", b"", b"", b"", + b"ms936", + b"csmacintosh", b"", - b"csiso885913", b"", b"", + b"utf_7", + b"ibm850", b"", b"", b"", + b"iso_ir_127", + b"csiso2022jp", b"", b"", + b"cseucpkdfmtjapanese", b"", b"", b"", b"", b"", - b"ks_c_5601-1989", b"", b"", b"", @@ -568,7 +454,6 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", - b"iso-celtic", b"", b"", b"", @@ -586,6 +471,7 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", + b"cskoi8u", b"", b"", b"", @@ -597,11 +483,10 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", + b"macintosh", b"", b"", - b"csisolatinarabic", b"", - b"csisolatincyrillic", b"", b"", b"", @@ -657,16 +542,15 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", + b"asmo_708", b"", b"", b"", b"", b"", - b"ks_c_5601-1987", b"", b"", b"", - b"cstis620", b"", b"", b"", @@ -674,7 +558,6 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", - b"csiso2022jp", b"", b"", b"", @@ -707,7 +590,6 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", - b"cspc850multilingual", b"", b"", b"", @@ -771,379 +653,39 @@ static CH_MAP: &[&[u8]; 758] = &[ b"", b"", b"", - b"", - b"", - b"", - b"", - b"", - b"cyrillic", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"", - b"csgb18030", + b"cspc850multilingual", ]; -#[allow(clippy::type_complexity)] -static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ - decoder_iso_8859_14, - no_op, - no_op, - no_op, - decoder_iso_8859_14, - decoder_iso_8859_1, - no_op, - no_op, - decoder_utf8, - decoder_iso_8859_1, - decoder_utf8, - no_op, - no_op, - no_op, - decoder_koi8_r, - decoder_iso_8859_9, - decoder_utf8, - no_op, - no_op, - decoder_iso_8859_9, - no_op, - decoder_shift_jis, - decoder_shift_jis, - no_op, - decoder_koi8_u, - no_op, - no_op, - decoder_big5, - no_op, - decoder_iso_8859_1, - no_op, - no_op, - no_op, - no_op, - decoder_euc_kr, - decoder_iso_8859_4, - no_op, - no_op, - no_op, - decoder_iso_8859_4, - no_op, - decoder_ibm866, - no_op, - decoder_iso_8859_9, - decoder_ibm866, - decoder_iso_8859_10, - decoder_iso_8859_7, - no_op, - decoder_iso_8859_8, - decoder_iso_8859_10, - no_op, - no_op, - no_op, - no_op, - decoder_utf16, - no_op, - no_op, - no_op, - decoder_iso_8859_1, - decoder_tis_620, - no_op, - no_op, - no_op, - decoder_iso_8859_8, - decoder_euc_jp, - decoder_iso_8859_15, - no_op, - decoder_utf8, - decoder_iso_8859_8, - decoder_iso_8859_15, - no_op, - no_op, - no_op, - decoder_iso_8859_1, - decoder_iso_8859_14, - no_op, - no_op, - no_op, - decoder_iso_8859_5, - decoder_iso_8859_16, - no_op, - decoder_utf16_be, - no_op, - decoder_iso_8859_5, - decoder_iso_8859_15, - no_op, - decoder_utf16_le, - no_op, - no_op, - decoder_iso_8859_14, - decoder_iso_8859_2, - decoder_utf8, - no_op, - decoder_iso_8859_5, - decoder_iso_8859_2, - no_op, - no_op, - no_op, - decoder_iso_8859_14, - decoder_iso_8859_16, - no_op, - no_op, - no_op, - decoder_iso_8859_4, - decoder_iso_8859_14, - no_op, - no_op, - no_op, - decoder_iso_8859_9, - no_op, - no_op, - no_op, - no_op, - decoder_iso_8859_5, - decoder_iso_8859_16, - no_op, - decoder_iso_8859_6, - no_op, - decoder_iso_8859_4, - decoder_iso_8859_8, - no_op, - decoder_ibm_850, - no_op, - decoder_iso_8859_9, - decoder_ibm_850, - decoder_cp1258, - decoder_iso_8859_16, - no_op, - decoder_iso_8859_9, - decoder_utf8, - decoder_cp1251, - decoder_iso_8859_6, - no_op, - decoder_iso_8859_4, - no_op, - no_op, - decoder_iso_8859_7, - no_op, - decoder_iso_8859_6, - no_op, - decoder_cp1255, - no_op, - no_op, - decoder_iso_8859_2, - no_op, - no_op, - decoder_gbk, - no_op, - decoder_utf7, - no_op, - no_op, - no_op, - no_op, - decoder_iso_8859_6, - no_op, - decoder_iso_8859_3, - no_op, - no_op, - no_op, - decoder_iso_8859_3, - decoder_cp1254, - no_op, - no_op, - decoder_iso_8859_8, - decoder_iso_8859_10, - no_op, - no_op, - no_op, - no_op, - decoder_iso_8859_7, - decoder_cp1256, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - decoder_iso_8859_10, - no_op, - no_op, - no_op, - decoder_iso_8859_7, - no_op, - no_op, - no_op, - no_op, - decoder_iso_8859_7, - no_op, - no_op, - no_op, - no_op, - decoder_iso_8859_3, - no_op, - no_op, - no_op, - no_op, - decoder_gbk, - no_op, - no_op, - no_op, - no_op, - no_op, - decoder_windows874, - no_op, - no_op, - no_op, - no_op, - decoder_iso_8859_13, - no_op, - no_op, - no_op, - decoder_iso_8859_1, - no_op, - decoder_cp1252, - no_op, - no_op, - decoder_euc_jp, - decoder_iso2022_jp, - no_op, - decoder_macintosh, - no_op, - decoder_iso_8859_3, - no_op, - decoder_koi8_r, - no_op, +static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 545] = &[ no_op, - decoder_iso_8859_2, - decoder_iso_8859_6, - decoder_utf8, no_op, + decoder_iso_8859_1, no_op, no_op, - decoder_utf8, - decoder_euc_kr, no_op, - decoder_macintosh, - decoder_gbk, - decoder_big5, + decoder_iso_8859_1, + decoder_iso_8859_14, no_op, no_op, no_op, - decoder_iso_8859_2, + decoder_iso_8859_14, no_op, - decoder_koi8_u, - decoder_cp1258, no_op, no_op, no_op, - decoder_cp1250, - decoder_cp1251, no_op, - decoder_iso_8859_6, + decoder_iso_8859_4, no_op, - decoder_iso_8859_16, no_op, - decoder_utf8, - decoder_iso_8859_1, decoder_gbk, - decoder_tis_620, - decoder_cp1255, - no_op, decoder_iso_8859_4, no_op, - decoder_cp1257, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, + decoder_gbk, no_op, - decoder_iso_8859_16, - decoder_iso_8859_1, - decoder_windows874, no_op, no_op, + decoder_iso_8859_9, no_op, - decoder_iso_8859_8, - decoder_cp1253, - decoder_cp1254, no_op, no_op, decoder_iso_8859_9, @@ -1151,160 +693,163 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, decoder_iso_8859_7, + decoder_iso_8859_7, no_op, no_op, - decoder_cp1256, - no_op, - no_op, - decoder_utf8, - no_op, - no_op, - no_op, - decoder_iso_8859_2, - decoder_iso_8859_15, no_op, no_op, - decoder_utf8, + decoder_big5, decoder_iso_8859_10, - decoder_iso_8859_4, no_op, no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - decoder_euc_jp, - decoder_ibm866, + decoder_iso_8859_1, decoder_iso_8859_10, no_op, no_op, - no_op, - no_op, - decoder_iso_8859_14, - no_op, - no_op, - no_op, - no_op, - no_op, - decoder_utf16, - no_op, - no_op, - decoder_iso_8859_7, - decoder_iso_8859_16, - no_op, + decoder_big5, + decoder_ibm866, no_op, no_op, no_op, no_op, + decoder_shift_jis, no_op, - decoder_cp1252, + decoder_iso_8859_3, no_op, no_op, + decoder_ibm_850, + decoder_iso_8859_3, no_op, no_op, no_op, + decoder_iso_8859_9, no_op, - decoder_iso_8859_7, no_op, no_op, no_op, + decoder_iso_8859_14, no_op, - decoder_iso_8859_7, no_op, no_op, no_op, - decoder_utf16_be, no_op, + decoder_euc_kr, + decoder_iso_8859_15, no_op, no_op, + decoder_gbk, no_op, - decoder_utf16_le, - decoder_iso_8859_3, decoder_iso_8859_2, no_op, - decoder_ibm866, - no_op, - decoder_ibm_850, - no_op, - no_op, - no_op, - no_op, decoder_shift_jis, + decoder_iso_8859_5, + decoder_iso_8859_2, no_op, + decoder_ibm866, no_op, - decoder_cp1250, - no_op, - decoder_iso_8859_3, - decoder_utf7, + decoder_iso_8859_2, + decoder_euc_jp, no_op, + decoder_iso_8859_5, no_op, no_op, - decoder_iso_8859_6, - decoder_gb18030, + decoder_koi8_r, no_op, no_op, no_op, decoder_iso_8859_1, - decoder_macintosh, - decoder_gb18030, - decoder_cp1257, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, - no_op, + decoder_tis_620, no_op, - decoder_cp1253, + decoder_iso_8859_16, no_op, + decoder_iso_8859_8, + decoder_iso_8859_16, + decoder_iso_8859_16, + decoder_ibm_850, no_op, + decoder_iso_8859_8, + decoder_iso_8859_14, + decoder_tis_620, + decoder_iso_8859_7, no_op, + decoder_iso_8859_8, + decoder_iso_8859_14, + decoder_gb18030, no_op, no_op, no_op, - decoder_utf8, + decoder_iso_8859_14, + decoder_koi8_r, no_op, no_op, + decoder_iso_8859_4, + decoder_iso_8859_15, no_op, + decoder_cp1251, no_op, + decoder_iso_8859_4, + decoder_iso_8859_15, + decoder_euc_kr, + decoder_cp1258, no_op, + decoder_iso_8859_5, + decoder_iso_8859_8, no_op, no_op, no_op, + decoder_iso_8859_4, + decoder_iso_8859_16, no_op, + decoder_cp1254, + decoder_euc_kr, + decoder_iso_8859_14, + decoder_iso_8859_16, + decoder_windows874, no_op, no_op, + decoder_iso_8859_5, + decoder_iso_8859_10, + decoder_cp1251, + decoder_cp1255, no_op, no_op, + decoder_iso_8859_13, + decoder_cp1258, + decoder_ibm866, no_op, - decoder_gbk, decoder_iso_8859_3, + decoder_iso_8859_13, no_op, no_op, no_op, + decoder_iso_8859_3, no_op, + decoder_cp1254, + decoder_cp1256, no_op, + decoder_iso_8859_9, no_op, no_op, no_op, no_op, + decoder_iso_8859_9, + decoder_gb18030, + decoder_cp1255, + decoder_cp1250, no_op, + decoder_iso_8859_6, no_op, no_op, + decoder_cp1253, no_op, no_op, - decoder_iso_8859_13, - no_op, + decoder_iso_8859_10, no_op, + decoder_tis_620, no_op, + decoder_iso_8859_1, + decoder_windows874, + decoder_cp1256, no_op, no_op, no_op, @@ -1312,29 +857,40 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, + decoder_iso_8859_7, + decoder_gbk, + decoder_cp1250, no_op, no_op, + decoder_iso_8859_1, + decoder_iso_8859_1, + decoder_cp1253, + decoder_cp1252, decoder_euc_kr, + decoder_iso_8859_3, + decoder_iso_8859_6, no_op, + decoder_iso_8859_5, no_op, + decoder_iso_8859_7, + decoder_koi8_u, no_op, no_op, no_op, no_op, + decoder_iso_8859_4, no_op, no_op, + decoder_gb18030, no_op, no_op, no_op, + decoder_cp1257, no_op, no_op, - no_op, - no_op, - decoder_iso_8859_14, - no_op, - no_op, - no_op, - no_op, + decoder_iso_8859_9, + decoder_cp1252, + decoder_macintosh, no_op, no_op, no_op, @@ -1346,32 +902,48 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, + decoder_iso_8859_6, + decoder_iso_8859_10, no_op, + decoder_utf16_le, no_op, no_op, no_op, + decoder_cp1257, no_op, + decoder_utf16_le, no_op, + decoder_utf16, no_op, + decoder_utf16_be, no_op, + decoder_iso_8859_2, + decoder_iso_8859_3, + decoder_utf16, + decoder_iso_8859_7, + decoder_utf16_be, + decoder_iso_8859_10, + decoder_iso_8859_8, no_op, no_op, no_op, + decoder_euc_jp, + decoder_iso2022_jp, no_op, no_op, no_op, no_op, decoder_iso_8859_6, no_op, - decoder_iso_8859_5, - no_op, - no_op, - no_op, - no_op, no_op, no_op, + decoder_iso_8859_16, + decoder_utf7, no_op, + decoder_iso_8859_6, no_op, + decoder_iso_8859_2, + decoder_iso_8859_2, no_op, no_op, no_op, @@ -1383,21 +955,33 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, + decoder_shift_jis, no_op, + decoder_iso_8859_7, + decoder_iso_8859_1, no_op, no_op, no_op, + decoder_iso_8859_7, + decoder_ibm866, no_op, no_op, no_op, + decoder_gbk, + decoder_macintosh, no_op, no_op, no_op, + decoder_utf7, + decoder_ibm_850, no_op, no_op, no_op, + decoder_iso_8859_6, + decoder_iso2022_jp, no_op, no_op, + decoder_euc_jp, no_op, no_op, no_op, @@ -1424,11 +1008,9 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, - decoder_euc_kr, no_op, no_op, no_op, - decoder_tis_620, no_op, no_op, no_op, @@ -1436,8 +1018,8 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, - decoder_iso2022_jp, no_op, + decoder_koi8_u, no_op, no_op, no_op, @@ -1449,6 +1031,7 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, + decoder_macintosh, no_op, no_op, no_op, @@ -1469,7 +1052,6 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, - decoder_ibm_850, no_op, no_op, no_op, @@ -1508,6 +1090,7 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, + decoder_iso_8859_6, no_op, no_op, no_op, @@ -1538,7 +1121,6 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, - decoder_iso_8859_5, no_op, no_op, no_op, @@ -1619,32 +1201,23 @@ static FNC_MAP: &[for<'x> fn(&'x [u8]) -> String; 758] = &[ no_op, no_op, no_op, - decoder_gb18030, + decoder_ibm_850, ]; #[cfg(test)] mod tests { - use super::charset_decoder; + use super::{charset_decoder, CH_MAP}; #[test] - #[allow(clippy::uninlined_format_args)] fn decoder_charset() { - let inputs = [ - "l8", - //"utf-8", - "utf-7", - //"US-Ascii", - "csgb18030", - "iso-8859-1", - "extended_unix_code_packed_format_for_japanese", - ]; - - for input in inputs { - assert!( - charset_decoder(input.as_bytes()).is_some(), - "Failed for {}", - input - ); + for input in CH_MAP { + if !input.is_empty() { + assert!( + charset_decoder(input).is_some(), + "Failed for {}", + std::str::from_utf8(input).unwrap() + ); + } } } }