Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added IBM850 (CP850) charset for german language with test #171

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/Core/CodepageName.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* ***** BEGIN LICENSE BLOCK *****
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
Expand Down Expand Up @@ -166,6 +166,14 @@ internal static class CodepageName
/// </remarks>
internal const string CP949 = "cp949";

/// <summary>
/// OEM Latin-1 codepage name.
/// </summary>
/// <remarks>
/// Also known under the alias "cp850" in .NET.
/// </remarks>
internal const string IBM850 = "ibm850";

/// <summary>
/// OEM Latin-2 codepage name.
/// </summary>
Expand Down Expand Up @@ -401,4 +409,4 @@ internal static class CodepageName
/// </remarks>
internal const string X_ISO_10646_UCS_4_2143 = "X-ISO-10646-UCS-4-2143";
}
}
}
50 changes: 50 additions & 0 deletions src/Core/Models/SingleByte/German/Ibm850_GermanModel.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

namespace UtfUnknown.Core.Models.SingleByte.German
{
    /// <summary>
    /// German language model for text encoded in IBM850 (code page 850).
    /// Supplies the byte-to-frequency-order table for this code page to
    /// <see cref="GermanModel"/>, together with the charset name
    /// <see cref="CodepageName.IBM850"/>.
    /// </summary>
    public class Ibm850_GermanModel : GermanModel
    {
        // Generated by BuildLangModel.py
        // On: 2016-09-21 03:28:11.733089
        //
        // Hand-corrected afterwards: bytes 0x8E and 0x90 (see the notes on
        // rows 8X and 9X below).

        // Character Mapping Table:
        // ILL: illegal character.
        // CTR: control character specific to the charset.
        // RET: carriage/return.
        // SYM: symbol (punctuation) that does not belong to word.
        // NUM: 0 - 9.

        // Other characters are ordered by probabilities
        // (0 is the most common character in the language).

        // Orders are generic to a language. So the codepoint with order X in
        // CHARSET1 maps to the same character as the codepoint with the same
        // order X in CHARSET2 for the same language.
        // As such, it is possible to get missing order. For instance the
        // ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
        // even though they are both used for French. Same for the euro sign.

        // Upper- and lowercase forms of a letter share one order (compare
        // rows 4X/5X with 6X/7X), so each accented capital below must carry
        // the same value as its lowercase counterpart.
        //
        // NOTE(review): two case pairs still disagree and could not be
        // resolved from this file alone - 0x96 (63) vs 0xEA (61), and
        // 0xE7 (62) vs 0xE8 (64). Confirm against the generator output.
        private static readonly byte[] CHAR_TO_ORDER_MAP = {
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
            CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
            NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
            SYM,  5, 15, 12,  8,  0, 17, 14,  7,  3, 23, 16,  9, 13,  2, 11, /* 4X */
             18, 30,  1,  4,  6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,SYM, /* 5X */
            SYM,  5, 15, 12,  8,  0, 17, 14,  7,  3, 23, 16,  9, 13,  2, 11, /* 6X */
             18, 30,  1,  4,  6, 10, 21, 19, 28, 25, 20,SYM,SYM,SYM,SYM,CTR, /* 7X */
            // 0x8E is capital A-umlaut: it takes order 22 like the lowercase
            // form at 0x84 (it previously held 44, the order used at 0xC6/0xC7).
             35, 24, 29, 37, 22, 41, 49, 35, 48, 43, 32, 52, 47, 57, 22, 49, /* 8X */
            // 0x90 is capital E-acute: it takes order 29 like the lowercase
            // form at 0x82 (it previously held 32, the order used at 0x8A/0xD4).
             29, 50, 50, 40, 26, 51, 63, 58, 56, 26, 24, 38,SYM, 38,SYM, 59, /* 9X */
             31, 33, 34, 46, 39, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
            SYM,SYM,SYM,SYM,SYM, 31, 37, 41,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
            SYM,SYM,SYM,SYM,SYM,SYM, 44, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
             53, 53, 48, 43, 32,SYM, 33, 47, 52,SYM,SYM,SYM,SYM,SYM, 57,SYM, /* DX */
             34, 27, 40, 51, 55, 55, 60, 62, 64, 46, 61, 58, 45, 45,SYM,SYM, /* EX */
            SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
        };
        /*   X0  X1  X2  X3  X4  X5  X6  X7  X8  X9  XA  XB  XC  XD  XE  XF */

        /// <summary>
        /// Creates the model by handing the IBM850 byte-to-order table and
        /// the <see cref="CodepageName.IBM850"/> charset name to the
        /// <see cref="GermanModel"/> base constructor.
        /// </summary>
        public Ibm850_GermanModel() : base(CHAR_TO_ORDER_MAP, CodepageName.IBM850)
        {
        }
    }
}
161 changes: 81 additions & 80 deletions src/Core/Probers/SBCSGroupProber.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* ***** BEGIN LICENSE BLOCK *****
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
Expand Down Expand Up @@ -76,7 +76,7 @@ namespace UtfUnknown.Core.Probers
{
public class SBCSGroupProber : CharsetProber
{
private const int PROBERS_NUM = 100;
private const int PROBERS_NUM = 101;
private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];
private bool[] isActive = new bool[PROBERS_NUM];
private int bestGuess;
Expand Down Expand Up @@ -131,123 +131,124 @@ public SBCSGroupProber()
probers[22] = new SingleByteCharSetProber(new Windows_1250_HungarianModel());

// German
probers[23] = new SingleByteCharSetProber(new Iso_8859_1_GermanModel());
probers[24] = new SingleByteCharSetProber(new Windows_1252_GermanModel());
probers[23] = new SingleByteCharSetProber(new Ibm850_GermanModel());
probers[24] = new SingleByteCharSetProber(new Iso_8859_1_GermanModel());
probers[25] = new SingleByteCharSetProber(new Windows_1252_GermanModel());

// Esperanto
probers[25] = new SingleByteCharSetProber(new Iso_8859_3_EsperantoModel());
probers[26] = new SingleByteCharSetProber(new Iso_8859_3_EsperantoModel());

// Turkish
probers[26] = new SingleByteCharSetProber(new Iso_8859_3_TurkishModel());
probers[27] = new SingleByteCharSetProber(new Iso_8859_9_TurkishModel());
probers[27] = new SingleByteCharSetProber(new Iso_8859_3_TurkishModel());
probers[28] = new SingleByteCharSetProber(new Iso_8859_9_TurkishModel());

// Arabic
probers[28] = new SingleByteCharSetProber(new Iso_8859_6_ArabicModel());
probers[29] = new SingleByteCharSetProber(new Windows_1256_ArabicModel());
probers[29] = new SingleByteCharSetProber(new Iso_8859_6_ArabicModel());
probers[30] = new SingleByteCharSetProber(new Windows_1256_ArabicModel());

// Vietnamese
probers[30] = new SingleByteCharSetProber(new Viscii_VietnameseModel());
probers[31] = new SingleByteCharSetProber(new Windows_1258_VietnameseModel());
probers[31] = new SingleByteCharSetProber(new Viscii_VietnameseModel());
probers[32] = new SingleByteCharSetProber(new Windows_1258_VietnameseModel());

// Danish
probers[32] = new SingleByteCharSetProber(new Iso_8859_15_DanishModel());
probers[33] = new SingleByteCharSetProber(new Iso_8859_1_DanishModel());
probers[34] = new SingleByteCharSetProber(new Windows_1252_DanishModel());
probers[33] = new SingleByteCharSetProber(new Iso_8859_15_DanishModel());
probers[34] = new SingleByteCharSetProber(new Iso_8859_1_DanishModel());
probers[35] = new SingleByteCharSetProber(new Windows_1252_DanishModel());

// Lithuanian
probers[35] = new SingleByteCharSetProber(new Iso_8859_13_LithuanianModel());
probers[36] = new SingleByteCharSetProber(new Iso_8859_10_LithuanianModel());
probers[37] = new SingleByteCharSetProber(new Iso_8859_4_LithuanianModel());
probers[36] = new SingleByteCharSetProber(new Iso_8859_13_LithuanianModel());
probers[37] = new SingleByteCharSetProber(new Iso_8859_10_LithuanianModel());
probers[38] = new SingleByteCharSetProber(new Iso_8859_4_LithuanianModel());

// Latvian
probers[38] = new SingleByteCharSetProber(new Iso_8859_13_LatvianModel());
probers[39] = new SingleByteCharSetProber(new Iso_8859_10_LatvianModel());
probers[40] = new SingleByteCharSetProber(new Iso_8859_4_LatvianModel());
probers[39] = new SingleByteCharSetProber(new Iso_8859_13_LatvianModel());
probers[40] = new SingleByteCharSetProber(new Iso_8859_10_LatvianModel());
probers[41] = new SingleByteCharSetProber(new Iso_8859_4_LatvianModel());

// Portuguese
probers[41] = new SingleByteCharSetProber(new Iso_8859_1_PortugueseModel());
probers[42] = new SingleByteCharSetProber(new Iso_8859_9_PortugueseModel());
probers[43] = new SingleByteCharSetProber(new Iso_8859_15_PortugueseModel());
probers[44] = new SingleByteCharSetProber(new Windows_1252_PortugueseModel());
probers[42] = new SingleByteCharSetProber(new Iso_8859_1_PortugueseModel());
probers[43] = new SingleByteCharSetProber(new Iso_8859_9_PortugueseModel());
probers[44] = new SingleByteCharSetProber(new Iso_8859_15_PortugueseModel());
probers[45] = new SingleByteCharSetProber(new Windows_1252_PortugueseModel());

// Maltese
probers[45] = new SingleByteCharSetProber(new Iso_8859_3_MalteseModel());
probers[46] = new SingleByteCharSetProber(new Iso_8859_3_MalteseModel());

// Czech
probers[46] = new SingleByteCharSetProber(new Windows_1250_CzechModel());
probers[47] = new SingleByteCharSetProber(new Iso_8859_2_CzechModel());
probers[48] = new SingleByteCharSetProber(new Mac_Centraleurope_CzechModel());
probers[49] = new SingleByteCharSetProber(new Ibm852_CzechModel());
probers[47] = new SingleByteCharSetProber(new Windows_1250_CzechModel());
probers[48] = new SingleByteCharSetProber(new Iso_8859_2_CzechModel());
probers[49] = new SingleByteCharSetProber(new Mac_Centraleurope_CzechModel());
probers[50] = new SingleByteCharSetProber(new Ibm852_CzechModel());

// Slovak
probers[50] = new SingleByteCharSetProber(new Windows_1250_SlovakModel());
probers[51] = new SingleByteCharSetProber(new Iso_8859_2_SlovakModel());
probers[52] = new SingleByteCharSetProber(new Mac_Centraleurope_SlovakModel());
probers[53] = new SingleByteCharSetProber(new Ibm852_SlovakModel());
probers[51] = new SingleByteCharSetProber(new Windows_1250_SlovakModel());
probers[52] = new SingleByteCharSetProber(new Iso_8859_2_SlovakModel());
probers[53] = new SingleByteCharSetProber(new Mac_Centraleurope_SlovakModel());
probers[54] = new SingleByteCharSetProber(new Ibm852_SlovakModel());

// Polish
probers[54] = new SingleByteCharSetProber(new Windows_1250_PolishModel());
probers[55] = new SingleByteCharSetProber(new Iso_8859_2_PolishModel());
probers[56] = new SingleByteCharSetProber(new Iso_8859_13_PolishModel());
probers[57] = new SingleByteCharSetProber(new Iso_8859_16_PolishModel());
probers[58] = new SingleByteCharSetProber(new Mac_Centraleurope_PolishModel());
probers[59] = new SingleByteCharSetProber(new Ibm852_PolishModel());
probers[55] = new SingleByteCharSetProber(new Windows_1250_PolishModel());
probers[56] = new SingleByteCharSetProber(new Iso_8859_2_PolishModel());
probers[57] = new SingleByteCharSetProber(new Iso_8859_13_PolishModel());
probers[58] = new SingleByteCharSetProber(new Iso_8859_16_PolishModel());
probers[59] = new SingleByteCharSetProber(new Mac_Centraleurope_PolishModel());
probers[60] = new SingleByteCharSetProber(new Ibm852_PolishModel());

// Finnish
probers[60] = new SingleByteCharSetProber(new Iso_8859_1_FinnishModel());
probers[61] = new SingleByteCharSetProber(new Iso_8859_4_FinnishModel());
probers[62] = new SingleByteCharSetProber(new Iso_8859_9_FinnishModel());
probers[63] = new SingleByteCharSetProber(new Iso_8859_13_FinnishModel());
probers[64] = new SingleByteCharSetProber(new Iso_8859_15_FinnishModel());
probers[65] = new SingleByteCharSetProber(new Windows_1252_FinnishModel());
probers[61] = new SingleByteCharSetProber(new Iso_8859_1_FinnishModel());
probers[62] = new SingleByteCharSetProber(new Iso_8859_4_FinnishModel());
probers[63] = new SingleByteCharSetProber(new Iso_8859_9_FinnishModel());
probers[64] = new SingleByteCharSetProber(new Iso_8859_13_FinnishModel());
probers[65] = new SingleByteCharSetProber(new Iso_8859_15_FinnishModel());
probers[66] = new SingleByteCharSetProber(new Windows_1252_FinnishModel());

// Italian
probers[66] = new SingleByteCharSetProber(new Iso_8859_1_ItalianModel());
probers[67] = new SingleByteCharSetProber(new Iso_8859_3_ItalianModel());
probers[68] = new SingleByteCharSetProber(new Iso_8859_9_ItalianModel());
probers[69] = new SingleByteCharSetProber(new Iso_8859_15_ItalianModel());
probers[70] = new SingleByteCharSetProber(new Windows_1252_ItalianModel());
probers[67] = new SingleByteCharSetProber(new Iso_8859_1_ItalianModel());
probers[68] = new SingleByteCharSetProber(new Iso_8859_3_ItalianModel());
probers[69] = new SingleByteCharSetProber(new Iso_8859_9_ItalianModel());
probers[70] = new SingleByteCharSetProber(new Iso_8859_15_ItalianModel());
probers[71] = new SingleByteCharSetProber(new Windows_1252_ItalianModel());

// Croatian
probers[71] = new SingleByteCharSetProber(new Windows_1250_CroatianModel());
probers[72] = new SingleByteCharSetProber(new Iso_8859_2_CroatianModel());
probers[73] = new SingleByteCharSetProber(new Iso_8859_13_CroatianModel());
probers[74] = new SingleByteCharSetProber(new Iso_8859_16_CroatianModel());
probers[75] = new SingleByteCharSetProber(new Mac_Centraleurope_CroatianModel());
probers[76] = new SingleByteCharSetProber(new Ibm852_CroatianModel());
probers[72] = new SingleByteCharSetProber(new Windows_1250_CroatianModel());
probers[73] = new SingleByteCharSetProber(new Iso_8859_2_CroatianModel());
probers[74] = new SingleByteCharSetProber(new Iso_8859_13_CroatianModel());
probers[75] = new SingleByteCharSetProber(new Iso_8859_16_CroatianModel());
probers[76] = new SingleByteCharSetProber(new Mac_Centraleurope_CroatianModel());
probers[77] = new SingleByteCharSetProber(new Ibm852_CroatianModel());

// Estonian
probers[77] = new SingleByteCharSetProber(new Windows_1252_EstonianModel());
probers[78] = new SingleByteCharSetProber(new Windows_1257_EstonianModel());
probers[79] = new SingleByteCharSetProber(new Iso_8859_4_EstonianModel());
probers[80] = new SingleByteCharSetProber(new Iso_8859_13_EstonianModel());
probers[81] = new SingleByteCharSetProber(new Iso_8859_15_EstonianModel());
probers[78] = new SingleByteCharSetProber(new Windows_1252_EstonianModel());
probers[79] = new SingleByteCharSetProber(new Windows_1257_EstonianModel());
probers[80] = new SingleByteCharSetProber(new Iso_8859_4_EstonianModel());
probers[81] = new SingleByteCharSetProber(new Iso_8859_13_EstonianModel());
probers[82] = new SingleByteCharSetProber(new Iso_8859_15_EstonianModel());

// Irish
probers[82] = new SingleByteCharSetProber(new Iso_8859_1_IrishModel());
probers[83] = new SingleByteCharSetProber(new Iso_8859_9_IrishModel());
probers[84] = new SingleByteCharSetProber(new Iso_8859_15_IrishModel());
probers[85] = new SingleByteCharSetProber(new Windows_1252_IrishModel());
probers[83] = new SingleByteCharSetProber(new Iso_8859_1_IrishModel());
probers[84] = new SingleByteCharSetProber(new Iso_8859_9_IrishModel());
probers[85] = new SingleByteCharSetProber(new Iso_8859_15_IrishModel());
probers[86] = new SingleByteCharSetProber(new Windows_1252_IrishModel());

// Romanian
probers[86] = new SingleByteCharSetProber(new Windows_1250_RomanianModel());
probers[87] = new SingleByteCharSetProber(new Iso_8859_2_RomanianModel());
probers[88] = new SingleByteCharSetProber(new Iso_8859_16_RomanianModel());
probers[89] = new SingleByteCharSetProber(new Ibm852_RomanianModel());
probers[87] = new SingleByteCharSetProber(new Windows_1250_RomanianModel());
probers[88] = new SingleByteCharSetProber(new Iso_8859_2_RomanianModel());
probers[89] = new SingleByteCharSetProber(new Iso_8859_16_RomanianModel());
probers[90] = new SingleByteCharSetProber(new Ibm852_RomanianModel());

// Slovene
probers[90] = new SingleByteCharSetProber(new Windows_1250_SloveneModel());
probers[91] = new SingleByteCharSetProber(new Iso_8859_2_SloveneModel());
probers[92] = new SingleByteCharSetProber(new Iso_8859_16_SloveneModel());
probers[93] = new SingleByteCharSetProber(new Mac_Centraleurope_SloveneModel());
probers[94] = new SingleByteCharSetProber(new Ibm852_SloveneModel());
probers[91] = new SingleByteCharSetProber(new Windows_1250_SloveneModel());
probers[92] = new SingleByteCharSetProber(new Iso_8859_2_SloveneModel());
probers[93] = new SingleByteCharSetProber(new Iso_8859_16_SloveneModel());
probers[94] = new SingleByteCharSetProber(new Mac_Centraleurope_SloveneModel());
probers[95] = new SingleByteCharSetProber(new Ibm852_SloveneModel());

// Swedish
probers[95] = new SingleByteCharSetProber(new Iso_8859_1_SwedishModel());
probers[96] = new SingleByteCharSetProber(new Iso_8859_4_SwedishModel());
probers[97] = new SingleByteCharSetProber(new Iso_8859_9_SwedishModel());
probers[98] = new SingleByteCharSetProber(new Iso_8859_15_SwedishModel());
probers[99] = new SingleByteCharSetProber(new Windows_1252_SwedishModel());
probers[96] = new SingleByteCharSetProber(new Iso_8859_1_SwedishModel());
probers[97] = new SingleByteCharSetProber(new Iso_8859_4_SwedishModel());
probers[98] = new SingleByteCharSetProber(new Iso_8859_9_SwedishModel());
probers[99] = new SingleByteCharSetProber(new Iso_8859_15_SwedishModel());
probers[100] = new SingleByteCharSetProber(new Windows_1252_SwedishModel());

Reset();
}
Expand Down Expand Up @@ -410,4 +411,4 @@ public override string GetCharsetName()
return probers[bestGuess].GetCharsetName();
}
}
}
}
3 changes: 3 additions & 0 deletions tests/Data/ibm850/lang_de_ibm850.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Auch gibt es niemanden, der den Schmerz an sich liebt, sucht oder wünscht, nur, weil er Schmerz ist, es sei denn, es kommt zu zufälligen Umständen, in denen Mühen und Schmerz ihm große Freude bereiten können.
Um ein triviales Beispiel zu nehmen, wer von uns unterzieht sich je anstrengender körperlicher Betätigung, außer um Vorteile daraus zu ziehen?
Aber wer hat irgend ein Recht, einen Menschen zu tadeln, der die Entscheidung trifft, eine Freude zu genießen, die keine unangenehmen Folgen hat, oder einen, der Schmerz vermeidet, welcher keine daraus resultierende Freude nach sich zieht?
4 changes: 4 additions & 0 deletions tests/UTF-unknown.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,8 @@
<ProjectReference Include="..\src\UTF-unknown.csproj" />
</ItemGroup>

<ItemGroup>
<Folder Include="Data\ibm850\" />
</ItemGroup>

</Project>