From 9fb37c306e312b51f34ff8be4276418cfc4ee33d Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 22 Aug 2024 18:14:46 +0200 Subject: [PATCH] Added IBM850 (CP850) charset for german language with test --- src/Core/CodepageName.cs | 12 +- .../SingleByte/German/Ibm850_GermanModel.cs | 50 ++++++ src/Core/Probers/SBCSGroupProber.cs | 161 +++++++++--------- tests/Data/ibm850/lang_de_ibm850.txt | 3 + tests/UTF-unknown.Tests.csproj | 4 + 5 files changed, 148 insertions(+), 82 deletions(-) create mode 100644 src/Core/Models/SingleByte/German/Ibm850_GermanModel.cs create mode 100644 tests/Data/ibm850/lang_de_ibm850.txt diff --git a/src/Core/CodepageName.cs b/src/Core/CodepageName.cs index 391f473..4b48385 100644 --- a/src/Core/CodepageName.cs +++ b/src/Core/CodepageName.cs @@ -1,4 +1,4 @@ -/* ***** BEGIN LICENSE BLOCK ***** +/* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version @@ -166,6 +166,14 @@ internal static class CodepageName /// internal const string CP949 = "cp949"; + /// + /// OEM Latin-1 codepage name. + /// + /// + /// Is other alias cp850 in .NET + /// + internal const string IBM850 = "ibm850"; + /// /// OEM Latin-2 codepage name. /// @@ -401,4 +409,4 @@ internal static class CodepageName /// internal const string X_ISO_10646_UCS_4_2143 = "X-ISO-10646-UCS-4-2143"; } -} \ No newline at end of file +} diff --git a/src/Core/Models/SingleByte/German/Ibm850_GermanModel.cs b/src/Core/Models/SingleByte/German/Ibm850_GermanModel.cs new file mode 100644 index 0000000..25a9fb5 --- /dev/null +++ b/src/Core/Models/SingleByte/German/Ibm850_GermanModel.cs @@ -0,0 +1,50 @@ + +namespace UtfUnknown.Core.Models.SingleByte.German +{ + public class Ibm850_GermanModel : GermanModel + { + // Generated by BuildLangModel.py + // On: 2016-09-21 03:28:11.733089 + + // Character Mapping Table: + // ILL: illegal character. + // CTR: control character specific to the charset. 
+ // RET: carriage return or line feed. + // SYM: symbol (punctuation) that does not belong to word. + // NUM: 0 - 9. + + // Other characters are ordered by probabilities + // (0 is the most common character in the language). + + // Orders are generic to a language. So the codepoint with order X in + // CHARSET1 maps to the same character as the codepoint with the same + // order X in CHARSET2 for the same language. + // As such, some orders may be missing from a charset. For instance the + // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + // even though they are both used for French. Same for the euro sign. + + private static byte[] CHAR_TO_ORDER_MAP = { + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 4X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 5, 15, 12, 8, 0, 17, 14, 7, 3, 23, 16, 9, 13, 2, 11, /* 6X */ + 18, 30, 1, 4, 6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 35, 24, 29, 37, 22, 41, 49, 35, 48, 43, 32, 52, 47, 57, 22, 49, /* 8X: 8E (capital A-diaeresis) shares order 22 with 84 */ + 29, 50, 50, 40, 26, 51, 63, 58, 56, 26, 24, 38,SYM, 38,SYM, 59, /* 9X: 90 (capital E-acute) shares order 29 with 82 */ + 31, 33, 34, 46, 39, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 31, 37, 41,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM, 44, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 53, 53, 48, 43, 32,SYM, 33, 47, 52,SYM,SYM,SYM,SYM,SYM, 57,SYM, /* DX */ + 34, 27, 40, 51, 55, 55, 60, 62, 64, 46, 61, 58, 45, 45,SYM,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */ + }; + /* X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + public Ibm850_GermanModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM850) + 
{ + } + } +} diff --git a/src/Core/Probers/SBCSGroupProber.cs b/src/Core/Probers/SBCSGroupProber.cs index 5263bfb..585cfae 100644 --- a/src/Core/Probers/SBCSGroupProber.cs +++ b/src/Core/Probers/SBCSGroupProber.cs @@ -1,4 +1,4 @@ -/* ***** BEGIN LICENSE BLOCK ***** +/* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version @@ -76,7 +76,7 @@ namespace UtfUnknown.Core.Probers { public class SBCSGroupProber : CharsetProber { - private const int PROBERS_NUM = 100; + private const int PROBERS_NUM = 101; private CharsetProber[] probers = new CharsetProber[PROBERS_NUM]; private bool[] isActive = new bool[PROBERS_NUM]; private int bestGuess; @@ -131,123 +131,124 @@ public SBCSGroupProber() probers[22] = new SingleByteCharSetProber(new Windows_1250_HungarianModel()); // German - probers[23] = new SingleByteCharSetProber(new Iso_8859_1_GermanModel()); - probers[24] = new SingleByteCharSetProber(new Windows_1252_GermanModel()); + probers[23] = new SingleByteCharSetProber(new Ibm850_GermanModel()); + probers[24] = new SingleByteCharSetProber(new Iso_8859_1_GermanModel()); + probers[25] = new SingleByteCharSetProber(new Windows_1252_GermanModel()); // Esperanto - probers[25] = new SingleByteCharSetProber(new Iso_8859_3_EsperantoModel()); + probers[26] = new SingleByteCharSetProber(new Iso_8859_3_EsperantoModel()); // Turkish - probers[26] = new SingleByteCharSetProber(new Iso_8859_3_TurkishModel()); - probers[27] = new SingleByteCharSetProber(new Iso_8859_9_TurkishModel()); + probers[27] = new SingleByteCharSetProber(new Iso_8859_3_TurkishModel()); + probers[28] = new SingleByteCharSetProber(new Iso_8859_9_TurkishModel()); // Arabic - probers[28] = new SingleByteCharSetProber(new Iso_8859_6_ArabicModel()); - probers[29] = new SingleByteCharSetProber(new Windows_1256_ArabicModel()); + probers[29] = new SingleByteCharSetProber(new Iso_8859_6_ArabicModel()); + probers[30] = new 
SingleByteCharSetProber(new Windows_1256_ArabicModel()); // Vietnamese - probers[30] = new SingleByteCharSetProber(new Viscii_VietnameseModel()); - probers[31] = new SingleByteCharSetProber(new Windows_1258_VietnameseModel()); + probers[31] = new SingleByteCharSetProber(new Viscii_VietnameseModel()); + probers[32] = new SingleByteCharSetProber(new Windows_1258_VietnameseModel()); // Danish - probers[32] = new SingleByteCharSetProber(new Iso_8859_15_DanishModel()); - probers[33] = new SingleByteCharSetProber(new Iso_8859_1_DanishModel()); - probers[34] = new SingleByteCharSetProber(new Windows_1252_DanishModel()); + probers[33] = new SingleByteCharSetProber(new Iso_8859_15_DanishModel()); + probers[34] = new SingleByteCharSetProber(new Iso_8859_1_DanishModel()); + probers[35] = new SingleByteCharSetProber(new Windows_1252_DanishModel()); // Lithuanian - probers[35] = new SingleByteCharSetProber(new Iso_8859_13_LithuanianModel()); - probers[36] = new SingleByteCharSetProber(new Iso_8859_10_LithuanianModel()); - probers[37] = new SingleByteCharSetProber(new Iso_8859_4_LithuanianModel()); + probers[36] = new SingleByteCharSetProber(new Iso_8859_13_LithuanianModel()); + probers[37] = new SingleByteCharSetProber(new Iso_8859_10_LithuanianModel()); + probers[38] = new SingleByteCharSetProber(new Iso_8859_4_LithuanianModel()); // Latvian - probers[38] = new SingleByteCharSetProber(new Iso_8859_13_LatvianModel()); - probers[39] = new SingleByteCharSetProber(new Iso_8859_10_LatvianModel()); - probers[40] = new SingleByteCharSetProber(new Iso_8859_4_LatvianModel()); + probers[39] = new SingleByteCharSetProber(new Iso_8859_13_LatvianModel()); + probers[40] = new SingleByteCharSetProber(new Iso_8859_10_LatvianModel()); + probers[41] = new SingleByteCharSetProber(new Iso_8859_4_LatvianModel()); // Portuguese - probers[41] = new SingleByteCharSetProber(new Iso_8859_1_PortugueseModel()); - probers[42] = new SingleByteCharSetProber(new Iso_8859_9_PortugueseModel()); - probers[43] = 
new SingleByteCharSetProber(new Iso_8859_15_PortugueseModel()); - probers[44] = new SingleByteCharSetProber(new Windows_1252_PortugueseModel()); + probers[42] = new SingleByteCharSetProber(new Iso_8859_1_PortugueseModel()); + probers[43] = new SingleByteCharSetProber(new Iso_8859_9_PortugueseModel()); + probers[44] = new SingleByteCharSetProber(new Iso_8859_15_PortugueseModel()); + probers[45] = new SingleByteCharSetProber(new Windows_1252_PortugueseModel()); // Maltese - probers[45] = new SingleByteCharSetProber(new Iso_8859_3_MalteseModel()); + probers[46] = new SingleByteCharSetProber(new Iso_8859_3_MalteseModel()); // Czech - probers[46] = new SingleByteCharSetProber(new Windows_1250_CzechModel()); - probers[47] = new SingleByteCharSetProber(new Iso_8859_2_CzechModel()); - probers[48] = new SingleByteCharSetProber(new Mac_Centraleurope_CzechModel()); - probers[49] = new SingleByteCharSetProber(new Ibm852_CzechModel()); + probers[47] = new SingleByteCharSetProber(new Windows_1250_CzechModel()); + probers[48] = new SingleByteCharSetProber(new Iso_8859_2_CzechModel()); + probers[49] = new SingleByteCharSetProber(new Mac_Centraleurope_CzechModel()); + probers[50] = new SingleByteCharSetProber(new Ibm852_CzechModel()); // Slovak - probers[50] = new SingleByteCharSetProber(new Windows_1250_SlovakModel()); - probers[51] = new SingleByteCharSetProber(new Iso_8859_2_SlovakModel()); - probers[52] = new SingleByteCharSetProber(new Mac_Centraleurope_SlovakModel()); - probers[53] = new SingleByteCharSetProber(new Ibm852_SlovakModel()); + probers[51] = new SingleByteCharSetProber(new Windows_1250_SlovakModel()); + probers[52] = new SingleByteCharSetProber(new Iso_8859_2_SlovakModel()); + probers[53] = new SingleByteCharSetProber(new Mac_Centraleurope_SlovakModel()); + probers[54] = new SingleByteCharSetProber(new Ibm852_SlovakModel()); // Polish - probers[54] = new SingleByteCharSetProber(new Windows_1250_PolishModel()); - probers[55] = new SingleByteCharSetProber(new 
Iso_8859_2_PolishModel()); - probers[56] = new SingleByteCharSetProber(new Iso_8859_13_PolishModel()); - probers[57] = new SingleByteCharSetProber(new Iso_8859_16_PolishModel()); - probers[58] = new SingleByteCharSetProber(new Mac_Centraleurope_PolishModel()); - probers[59] = new SingleByteCharSetProber(new Ibm852_PolishModel()); + probers[55] = new SingleByteCharSetProber(new Windows_1250_PolishModel()); + probers[56] = new SingleByteCharSetProber(new Iso_8859_2_PolishModel()); + probers[57] = new SingleByteCharSetProber(new Iso_8859_13_PolishModel()); + probers[58] = new SingleByteCharSetProber(new Iso_8859_16_PolishModel()); + probers[59] = new SingleByteCharSetProber(new Mac_Centraleurope_PolishModel()); + probers[60] = new SingleByteCharSetProber(new Ibm852_PolishModel()); // Finnish - probers[60] = new SingleByteCharSetProber(new Iso_8859_1_FinnishModel()); - probers[61] = new SingleByteCharSetProber(new Iso_8859_4_FinnishModel()); - probers[62] = new SingleByteCharSetProber(new Iso_8859_9_FinnishModel()); - probers[63] = new SingleByteCharSetProber(new Iso_8859_13_FinnishModel()); - probers[64] = new SingleByteCharSetProber(new Iso_8859_15_FinnishModel()); - probers[65] = new SingleByteCharSetProber(new Windows_1252_FinnishModel()); + probers[61] = new SingleByteCharSetProber(new Iso_8859_1_FinnishModel()); + probers[62] = new SingleByteCharSetProber(new Iso_8859_4_FinnishModel()); + probers[63] = new SingleByteCharSetProber(new Iso_8859_9_FinnishModel()); + probers[64] = new SingleByteCharSetProber(new Iso_8859_13_FinnishModel()); + probers[65] = new SingleByteCharSetProber(new Iso_8859_15_FinnishModel()); + probers[66] = new SingleByteCharSetProber(new Windows_1252_FinnishModel()); // Italian - probers[66] = new SingleByteCharSetProber(new Iso_8859_1_ItalianModel()); - probers[67] = new SingleByteCharSetProber(new Iso_8859_3_ItalianModel()); - probers[68] = new SingleByteCharSetProber(new Iso_8859_9_ItalianModel()); - probers[69] = new 
SingleByteCharSetProber(new Iso_8859_15_ItalianModel()); - probers[70] = new SingleByteCharSetProber(new Windows_1252_ItalianModel()); + probers[67] = new SingleByteCharSetProber(new Iso_8859_1_ItalianModel()); + probers[68] = new SingleByteCharSetProber(new Iso_8859_3_ItalianModel()); + probers[69] = new SingleByteCharSetProber(new Iso_8859_9_ItalianModel()); + probers[70] = new SingleByteCharSetProber(new Iso_8859_15_ItalianModel()); + probers[71] = new SingleByteCharSetProber(new Windows_1252_ItalianModel()); // Croatian - probers[71] = new SingleByteCharSetProber(new Windows_1250_CroatianModel()); - probers[72] = new SingleByteCharSetProber(new Iso_8859_2_CroatianModel()); - probers[73] = new SingleByteCharSetProber(new Iso_8859_13_CroatianModel()); - probers[74] = new SingleByteCharSetProber(new Iso_8859_16_CroatianModel()); - probers[75] = new SingleByteCharSetProber(new Mac_Centraleurope_CroatianModel()); - probers[76] = new SingleByteCharSetProber(new Ibm852_CroatianModel()); + probers[72] = new SingleByteCharSetProber(new Windows_1250_CroatianModel()); + probers[73] = new SingleByteCharSetProber(new Iso_8859_2_CroatianModel()); + probers[74] = new SingleByteCharSetProber(new Iso_8859_13_CroatianModel()); + probers[75] = new SingleByteCharSetProber(new Iso_8859_16_CroatianModel()); + probers[76] = new SingleByteCharSetProber(new Mac_Centraleurope_CroatianModel()); + probers[77] = new SingleByteCharSetProber(new Ibm852_CroatianModel()); // Estonian - probers[77] = new SingleByteCharSetProber(new Windows_1252_EstonianModel()); - probers[78] = new SingleByteCharSetProber(new Windows_1257_EstonianModel()); - probers[79] = new SingleByteCharSetProber(new Iso_8859_4_EstonianModel()); - probers[80] = new SingleByteCharSetProber(new Iso_8859_13_EstonianModel()); - probers[81] = new SingleByteCharSetProber(new Iso_8859_15_EstonianModel()); + probers[78] = new SingleByteCharSetProber(new Windows_1252_EstonianModel()); + probers[79] = new SingleByteCharSetProber(new 
Windows_1257_EstonianModel()); + probers[80] = new SingleByteCharSetProber(new Iso_8859_4_EstonianModel()); + probers[81] = new SingleByteCharSetProber(new Iso_8859_13_EstonianModel()); + probers[82] = new SingleByteCharSetProber(new Iso_8859_15_EstonianModel()); // Irish - probers[82] = new SingleByteCharSetProber(new Iso_8859_1_IrishModel()); - probers[83] = new SingleByteCharSetProber(new Iso_8859_9_IrishModel()); - probers[84] = new SingleByteCharSetProber(new Iso_8859_15_IrishModel()); - probers[85] = new SingleByteCharSetProber(new Windows_1252_IrishModel()); + probers[83] = new SingleByteCharSetProber(new Iso_8859_1_IrishModel()); + probers[84] = new SingleByteCharSetProber(new Iso_8859_9_IrishModel()); + probers[85] = new SingleByteCharSetProber(new Iso_8859_15_IrishModel()); + probers[86] = new SingleByteCharSetProber(new Windows_1252_IrishModel()); // Romanian - probers[86] = new SingleByteCharSetProber(new Windows_1250_RomanianModel()); - probers[87] = new SingleByteCharSetProber(new Iso_8859_2_RomanianModel()); - probers[88] = new SingleByteCharSetProber(new Iso_8859_16_RomanianModel()); - probers[89] = new SingleByteCharSetProber(new Ibm852_RomanianModel()); + probers[87] = new SingleByteCharSetProber(new Windows_1250_RomanianModel()); + probers[88] = new SingleByteCharSetProber(new Iso_8859_2_RomanianModel()); + probers[89] = new SingleByteCharSetProber(new Iso_8859_16_RomanianModel()); + probers[90] = new SingleByteCharSetProber(new Ibm852_RomanianModel()); // Slovene - probers[90] = new SingleByteCharSetProber(new Windows_1250_SloveneModel()); - probers[91] = new SingleByteCharSetProber(new Iso_8859_2_SloveneModel()); - probers[92] = new SingleByteCharSetProber(new Iso_8859_16_SloveneModel()); - probers[93] = new SingleByteCharSetProber(new Mac_Centraleurope_SloveneModel()); - probers[94] = new SingleByteCharSetProber(new Ibm852_SloveneModel()); + probers[91] = new SingleByteCharSetProber(new Windows_1250_SloveneModel()); + probers[92] = new 
SingleByteCharSetProber(new Iso_8859_2_SloveneModel()); + probers[93] = new SingleByteCharSetProber(new Iso_8859_16_SloveneModel()); + probers[94] = new SingleByteCharSetProber(new Mac_Centraleurope_SloveneModel()); + probers[95] = new SingleByteCharSetProber(new Ibm852_SloveneModel()); // Swedish - probers[95] = new SingleByteCharSetProber(new Iso_8859_1_SwedishModel()); - probers[96] = new SingleByteCharSetProber(new Iso_8859_4_SwedishModel()); - probers[97] = new SingleByteCharSetProber(new Iso_8859_9_SwedishModel()); - probers[98] = new SingleByteCharSetProber(new Iso_8859_15_SwedishModel()); - probers[99] = new SingleByteCharSetProber(new Windows_1252_SwedishModel()); + probers[96] = new SingleByteCharSetProber(new Iso_8859_1_SwedishModel()); + probers[97] = new SingleByteCharSetProber(new Iso_8859_4_SwedishModel()); + probers[98] = new SingleByteCharSetProber(new Iso_8859_9_SwedishModel()); + probers[99] = new SingleByteCharSetProber(new Iso_8859_15_SwedishModel()); + probers[100] = new SingleByteCharSetProber(new Windows_1252_SwedishModel()); Reset(); } @@ -410,4 +411,4 @@ public override string GetCharsetName() return probers[bestGuess].GetCharsetName(); } } -} \ No newline at end of file +} diff --git a/tests/Data/ibm850/lang_de_ibm850.txt b/tests/Data/ibm850/lang_de_ibm850.txt new file mode 100644 index 0000000..82359e0 --- /dev/null +++ b/tests/Data/ibm850/lang_de_ibm850.txt @@ -0,0 +1,3 @@ +Auch gibt es niemanden, der den Schmerz an sich liebt, sucht oder wnscht, nur, weil er Schmerz ist, es sei denn, es kommt zu zuflligen Umstnden, in denen Mhen und Schmerz ihm groe Freude bereiten knnen. +Um ein triviales Beispiel zu nehmen, wer von uns unterzieht sich je anstrengender krperlicher Bettigung, auer um Vorteile daraus zu ziehen? 
+Aber wer hat irgend ein Recht, einen Menschen zu tadeln, der die Entscheidung trifft, eine Freude zu genieen, die keine unangenehmen Folgen hat, oder einen, der Schmerz vermeidet, welcher keine daraus resultierende Freude nach sich zieht? diff --git a/tests/UTF-unknown.Tests.csproj b/tests/UTF-unknown.Tests.csproj index 87ba6bf..9ba3cee 100644 --- a/tests/UTF-unknown.Tests.csproj +++ b/tests/UTF-unknown.Tests.csproj @@ -19,4 +19,8 @@ + + + +