From ca6f55b38200921a6409f5c244c8e9e7e0841fc5 Mon Sep 17 00:00:00 2001 From: Bill Farmer Date: Sat, 1 Jan 2022 18:29:49 +0000 Subject: [PATCH] Update charset detector --- .../com/ibm/icu/text/CharsetDetector.java | 586 -------- .../java/com/ibm/icu/text/CharsetMatch.java | 247 ---- .../com/ibm/icu/text/CharsetRecog_2022.java | 173 --- .../com/ibm/icu/text/CharsetRecog_UTF8.java | 99 -- .../ibm/icu/text/CharsetRecog_Unicode.java | 212 --- .../com/ibm/icu/text/CharsetRecog_mbcs.java | 562 -------- .../com/ibm/icu/text/CharsetRecog_sbcs.java | 1253 ----------------- .../com/ibm/icu/text/CharsetRecognizer.java | 52 - .../java/org/billthefarmer/editor/Editor.java | 42 +- .../universalchardet/CharsetDetector.java | 6 +- 10 files changed, 9 insertions(+), 3223 deletions(-) delete mode 100644 src/main/java/com/ibm/icu/text/CharsetDetector.java delete mode 100644 src/main/java/com/ibm/icu/text/CharsetMatch.java delete mode 100644 src/main/java/com/ibm/icu/text/CharsetRecog_2022.java delete mode 100644 src/main/java/com/ibm/icu/text/CharsetRecog_UTF8.java delete mode 100644 src/main/java/com/ibm/icu/text/CharsetRecog_Unicode.java delete mode 100644 src/main/java/com/ibm/icu/text/CharsetRecog_mbcs.java delete mode 100644 src/main/java/com/ibm/icu/text/CharsetRecog_sbcs.java delete mode 100644 src/main/java/com/ibm/icu/text/CharsetRecognizer.java diff --git a/src/main/java/com/ibm/icu/text/CharsetDetector.java b/src/main/java/com/ibm/icu/text/CharsetDetector.java deleted file mode 100644 index 1416055..0000000 --- a/src/main/java/com/ibm/icu/text/CharsetDetector.java +++ /dev/null @@ -1,586 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/** -******************************************************************************* -* Copyright (C) 2005-2016, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ -package com.ibm.icu.text; - -import java.io.IOException; -import java.io.InputStream; -import java.io.Reader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - - -/** - * CharsetDetector provides a facility for detecting the - * charset or encoding of character data in an unknown format. - * The input data can either be from an input stream or an array of bytes. - * The result of the detection operation is a list of possibly matching - * charsets, or, for simple use, you can just ask for a Java Reader that - * will will work over the input data. - *

- * Character set detection is at best an imprecise operation. The detection - * process will attempt to identify the charset that best matches the characteristics - * of the byte data, but the process is partly statistical in nature, and - * the results can not be guaranteed to always be correct. - *

- * For best accuracy in charset detection, the input data should be primarily - * in a single language, and a minimum of a few hundred bytes worth of plain text - * in the language are needed. The detection process will attempt to - * ignore html or xml style markup that could otherwise obscure the content. - *

- * @stable ICU 3.4 - */ -public class CharsetDetector { - -// Question: Should we have getters corresponding to the setters for input text -// and declared encoding? - -// A thought: If we were to create our own type of Java Reader, we could defer -// figuring out an actual charset for data that starts out with too much English -// only ASCII until the user actually read through to something that didn't look -// like 7 bit English. If nothing else ever appeared, we would never need to -// actually choose the "real" charset. All assuming that the application just -// wants the data, and doesn't care about a char set name. - - /** - * Constructor - * - * @stable ICU 3.4 - */ - public CharsetDetector() { - } - - /** - * Set the declared encoding for charset detection. - * The declared encoding of an input text is an encoding obtained - * from an http header or xml declaration or similar source that - * can be provided as additional information to the charset detector. - * A match between a declared encoding and a possible detected encoding - * will raise the quality of that detected encoding by a small delta, - * and will also appear as a "reason" for the match. - *

- * A declared encoding that is incompatible with the input data being - * analyzed will not be added to the list of possible encodings. - * - * @param encoding The declared encoding - * - * @stable ICU 3.4 - */ - public CharsetDetector setDeclaredEncoding(String encoding) { - fDeclaredEncoding = encoding; - return this; - } - - /** - * Set the input text (byte) data whose charset is to be detected. - * - * @param in the input text of unknown encoding - * - * @return This CharsetDetector - * - * @stable ICU 3.4 - */ - public CharsetDetector setText(byte [] in) { - fRawInput = in; - fRawLength = in.length; - - return this; - } - - private static final int kBufSize = 8000; - - /** - * Set the input text (byte) data whose charset is to be detected. - *

- * The input stream that supplies the character data must have markSupported() - * == true; the charset detection process will read a small amount of data, - * then return the stream to its original position via - * the InputStream.reset() operation. The exact amount that will - * be read depends on the characteristics of the data itself. - * - * @param in the input text of unknown encoding - * - * @return This CharsetDetector - * - * @stable ICU 3.4 - */ - - public CharsetDetector setText(InputStream in) throws IOException { - fInputStream = in; - fInputStream.mark(kBufSize); - fRawInput = new byte[kBufSize]; // Always make a new buffer because the - // previous one may have come from the caller, - // in which case we can't touch it. - fRawLength = 0; - int remainingLength = kBufSize; - while (remainingLength > 0 ) { - // read() may give data in smallish chunks, esp. for remote sources. Hence, this loop. - int bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength); - if (bytesRead <= 0) { - break; - } - fRawLength += bytesRead; - remainingLength -= bytesRead; - } - fInputStream.reset(); - - return this; - } - - - /** - * Return the charset that best matches the supplied input data. - * - * Note though, that because the detection - * only looks at the start of the input data, - * there is a possibility that the returned charset will fail to handle - * the full set of input data. - *

- * Raise an exception if - *

- * - * @return a CharsetMatch object representing the best matching charset, or - * null if there are no matches. - * - * @stable ICU 3.4 - */ - public CharsetMatch detect() { -// TODO: A better implementation would be to copy the detect loop from -// detectAll(), and cut it short as soon as a match with a high confidence -// is found. This is something to be done later, after things are otherwise -// working. - CharsetMatch matches[] = detectAll(); - - if (matches == null || matches.length == 0) { - return null; - } - - return matches[0]; - } - - /** - * Return an array of all charsets that appear to be plausible - * matches with the input data. The array is ordered with the - * best quality match first. - *

- * Raise an exception if - *

- * - * @return An array of CharsetMatch objects representing possibly matching charsets. - * - * @stable ICU 3.4 - */ - public CharsetMatch[] detectAll() { - ArrayList matches = new ArrayList(); - - MungeInput(); // Strip html markup, collect byte stats. - - // Iterate over all possible charsets, remember all that - // give a match quality > 0. - for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { - CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); - boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled; - if (active) { - CharsetMatch m = rcinfo.recognizer.match(this); - if (m != null) { - matches.add(m); - } - } - } - Collections.sort(matches); // CharsetMatch compares on confidence - Collections.reverse(matches); // Put best match first. - CharsetMatch [] resultArray = new CharsetMatch[matches.size()]; - resultArray = matches.toArray(resultArray); - return resultArray; - } - - - /** - * Autodetect the charset of an inputStream, and return a Java Reader - * to access the converted input data. - *

- * This is a convenience method that is equivalent to - * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader(); - *

- * For the input stream that supplies the character data, markSupported() - * must be true; the charset detection will read a small amount of data, - * then return the stream to its original position via - * the InputStream.reset() operation. The exact amount that will - * be read depends on the characteristics of the data itself. - *

- * Raise an exception if no charsets appear to match the input data. - * - * @param in The source of the byte data in the unknown charset. - * - * @param declaredEncoding A declared encoding for the data, if available, - * or null or an empty string if none is available. - * - * @stable ICU 3.4 - */ - public Reader getReader(InputStream in, String declaredEncoding) { - fDeclaredEncoding = declaredEncoding; - - try { - setText(in); - - CharsetMatch match = detect(); - - if (match == null) { - return null; - } - - return match.getReader(); - } catch (IOException e) { - return null; - } - } - - /** - * Autodetect the charset of an inputStream, and return a String - * containing the converted input data. - *

- * This is a convenience method that is equivalent to - * this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString(); - *

- * Raise an exception if no charsets appear to match the input data. - * - * @param in The source of the byte data in the unknown charset. - * - * @param declaredEncoding A declared encoding for the data, if available, - * or null or an empty string if none is available. - * - * @stable ICU 3.4 - */ - public String getString(byte[] in, String declaredEncoding) - { - fDeclaredEncoding = declaredEncoding; - - try { - setText(in); - - CharsetMatch match = detect(); - - if (match == null) { - return null; - } - - return match.getString(-1); - } catch (IOException e) { - return null; - } - } - - - /** - * Get the names of all charsets supported by CharsetDetector class. - *

- * Note: Multiple different charset encodings in a same family may use - * a single shared name in this implementation. For example, this method returns - * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" - * (Windows Latin 1). However, actual detection result could be "windows-1252" - * when the input data matches Latin 1 code points with any points only available - * in "windows-1252". - * - * @return an array of the names of all charsets supported by - * CharsetDetector class. - * - * @stable ICU 3.4 - */ - public static String[] getAllDetectableCharsets() { - String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()]; - for (int i = 0; i < allCharsetNames.length; i++) { - allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName(); - } - return allCharsetNames; - } - - /** - * Test whether or not input filtering is enabled. - * - * @return true if input text will be filtered. - * - * @see #enableInputFilter - * - * @stable ICU 3.4 - */ - public boolean inputFilterEnabled() - { - return fStripTags; - } - - /** - * Enable filtering of input text. If filtering is enabled, - * text within angle brackets ("<" and ">") will be removed - * before detection. - * - * @param filter true to enable input text filtering. - * - * @return The previous setting. - * - * @stable ICU 3.4 - */ - public boolean enableInputFilter(boolean filter) - { - boolean previous = fStripTags; - - fStripTags = filter; - - return previous; - } - - /* - * MungeInput - after getting a set of raw input data to be analyzed, preprocess - * it by removing what appears to be html markup. - */ - private void MungeInput() { - int srci = 0; - int dsti = 0; - byte b; - boolean inMarkup = false; - int openTags = 0; - int badTags = 0; - - // - // html / xml markup stripping. - // quick and dirty, not 100% accurate, but hopefully good enough, statistically. - // discard everything within < brackets > - // Count how many total '<' and illegal (nested) '<' occur, so we can make some - // guess as to whether the input was actually marked up at all. - if (fStripTags) { - for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) { - b = fRawInput[srci]; - if (b == (byte)'<') { - if (inMarkup) { - badTags++; - } - inMarkup = true; - openTags++; - } - - if (! inMarkup) { - fInputBytes[dsti++] = b; - } - - if (b == (byte)'>') { - inMarkup = false; - } - } - - fInputLen = dsti; - } - - // - // If it looks like this input wasn't marked up, or if it looks like it's - // essentially nothing but markup abandon the markup stripping. - // Detection will have to work on the unstripped input. - // - if (openTags<5 || openTags/5 < badTags || - (fInputLen < 100 && fRawLength>600)) { - int limit = fRawLength; - - if (limit > kBufSize) { - limit = kBufSize; - } - - for (srci=0; srci ALL_CS_RECOGNIZERS; - - static { - List list = new ArrayList(); - - list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true)); - - list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true)); - - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true)); - - // IBM 420/424 recognizers are disabled by default - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false)); - list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false)); - - ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list); - } - - /** - * Get the names of charsets that can be recognized by this CharsetDetector instance. - * - * @return an array of the names of charsets that can be recognized by this CharsetDetector - * instance. - * - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - public String[] getDetectableCharsets() { - List csnames = new ArrayList(ALL_CS_RECOGNIZERS.size()); - for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { - CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); - boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i]; - if (active) { - csnames.add(rcinfo.recognizer.getName()); - } - } - return csnames.toArray(new String[csnames.size()]); - } - - /** - * Enable or disable individual charset encoding. - * A name of charset encoding must be included in the names returned by - * {@link #getAllDetectableCharsets()}. - * - * @param encoding the name of charset encoding. - * @param enabled true to enable, or false to disable the - * charset encoding. - * @return A reference to this CharsetDetector. - * @throws IllegalArgumentException when the name of charset encoding is - * not supported. - * - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - public CharsetDetector setDetectableCharset(String encoding, boolean enabled) { - int modIdx = -1; - boolean isDefaultVal = false; - for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { - CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i); - if (csrinfo.recognizer.getName().equals(encoding)) { - modIdx = i; - isDefaultVal = (csrinfo.isDefaultEnabled == enabled); - break; - } - } - if (modIdx < 0) { - // No matching encoding found - throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\""); - } - - if (fEnabledRecognizers == null && !isDefaultVal) { - // Create an array storing the non default setting - fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()]; - - // Initialize the array with default info - for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { - fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled; - } - } - - if (fEnabledRecognizers != null) { - fEnabledRecognizers[modIdx] = enabled; - } - - return this; - } -} diff --git a/src/main/java/com/ibm/icu/text/CharsetMatch.java b/src/main/java/com/ibm/icu/text/CharsetMatch.java deleted file mode 100644 index fc8dcd0..0000000 --- a/src/main/java/com/ibm/icu/text/CharsetMatch.java +++ /dev/null @@ -1,247 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/** -******************************************************************************* -* Copyright (C) 2005-2016, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ -package com.ibm.icu.text; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; - - -/** - * This class represents a charset that has been identified by a CharsetDetector - * as a possible encoding for a set of input data. From an instance of this - * class, you can ask for a confidence level in the charset identification, - * or for Java Reader or String to access the original byte data in Unicode form. - *

- * Instances of this class are created only by CharsetDetectors. - *

- * Note: this class has a natural ordering that is inconsistent with equals. - * The natural ordering is based on the match confidence value. - * - * @stable ICU 3.4 - */ -public class CharsetMatch implements Comparable { - - - /** - * Create a java.io.Reader for reading the Unicode character data corresponding - * to the original byte data supplied to the Charset detect operation. - *

- * CAUTION: if the source of the byte data was an InputStream, a Reader - * can be created for only one matching char set using this method. If more - * than one charset needs to be tried, the caller will need to reset - * the InputStream and create InputStreamReaders itself, based on the charset name. - * - * @return the Reader for the Unicode character data. - * - * @stable ICU 3.4 - */ - public Reader getReader() { - InputStream inputStream = fInputStream; - - if (inputStream == null) { - inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength); - } - - try { - inputStream.reset(); - return new InputStreamReader(inputStream, getName()); - } catch (IOException e) { - return null; - } - } - - /** - * Create a Java String from Unicode character data corresponding - * to the original byte data supplied to the Charset detect operation. - * - * @return a String created from the converted input data. - * - * @stable ICU 3.4 - */ - public String getString() throws java.io.IOException { - return getString(-1); - - } - - /** - * Create a Java String from Unicode character data corresponding - * to the original byte data supplied to the Charset detect operation. - * The length of the returned string is limited to the specified size; - * the string will be trunctated to this length if necessary. A limit value of - * zero or less is ignored, and treated as no limit. - * - * @param maxLength The maximum length of the String to be created when the - * source of the data is an input stream, or -1 for - * unlimited length. - * @return a String created from the converted input data. - * - * @stable ICU 3.4 - */ - public String getString(int maxLength) throws java.io.IOException { - String result = null; - if (fInputStream != null) { - StringBuilder sb = new StringBuilder(); - char[] buffer = new char[1024]; - Reader reader = getReader(); - int max = maxLength < 0? Integer.MAX_VALUE : maxLength; - int bytesRead = 0; - - while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) { - sb.append(buffer, 0, bytesRead); - max -= bytesRead; - } - - reader.close(); - - return sb.toString(); - } else { - String name = getName(); - /* - * getName() may return a name with a suffix 'rtl' or 'ltr'. This cannot - * be used to open a charset (e.g. IBM424_rtl). The ending '_rtl' or 'ltr' - * should be stripped off before creating the string. - */ - int startSuffix = name.indexOf("_rtl") < 0 ? name.indexOf("_ltr") : name.indexOf("_rtl"); - if (startSuffix > 0) { - name = name.substring(0, startSuffix); - } - result = new String(fRawInput, name); - } - return result; - - } - - /** - * Get an indication of the confidence in the charset detected. - * Confidence values range from 0-100, with larger numbers indicating - * a better match of the input data to the characteristics of the - * charset. - * - * @return the confidence in the charset match - * - * @stable ICU 3.4 - */ - public int getConfidence() { - return fConfidence; - } - - /** - * Get the name of the detected charset. - * The name will be one that can be used with other APIs on the - * platform that accept charset names. It is the "Canonical name" - * as defined by the class java.nio.charset.Charset; for - * charsets that are registered with the IANA charset registry, - * this is the MIME-preferred registerd name. - * - * @see java.nio.charset.Charset - * @see java.io.InputStreamReader - * - * @return The name of the charset. - * - * @stable ICU 3.4 - */ - public String getName() { - return fCharsetName; - } - - /** - * Get the ISO code for the language of the detected charset. - * - * @return The ISO code for the language or null if the language cannot be determined. - * - * @stable ICU 3.4 - */ - public String getLanguage() { - return fLang; - } - - /** - * Compare to other CharsetMatch objects. - * Comparison is based on the match confidence value, which - * allows CharsetDetector.detectAll() to order its results. - * - * @param other the CharsetMatch object to compare against. - * @return a negative integer, zero, or a positive integer as the - * confidence level of this CharsetMatch - * is less than, equal to, or greater than that of - * the argument. - * @throws ClassCastException if the argument is not a CharsetMatch. - * @stable ICU 4.4 - */ - @Override - public int compareTo (CharsetMatch other) { - int compareResult = 0; - if (this.fConfidence > other.fConfidence) { - compareResult = 1; - } else if (this.fConfidence < other.fConfidence) { - compareResult = -1; - } - return compareResult; - } - - /* - * Constructor. Implementation internal - */ - CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) { - fConfidence = conf; - - // The references to the original application input data must be copied out - // of the charset recognizer to here, in case the application resets the - // recognizer before using this CharsetMatch. - if (det.fInputStream == null) { - // We only want the existing input byte data if it came straight from the user, - // not if is just the head of a stream. - fRawInput = det.fRawInput; - fRawLength = det.fRawLength; - } - fInputStream = det.fInputStream; - fCharsetName = rec.getName(); - fLang = rec.getLanguage(); - } - - /* - * Constructor. Implementation internal - */ - CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf, String csName, String lang) { - fConfidence = conf; - - // The references to the original application input data must be copied out - // of the charset recognizer to here, in case the application resets the - // recognizer before using this CharsetMatch. - if (det.fInputStream == null) { - // We only want the existing input byte data if it came straight from the user, - // not if is just the head of a stream. - fRawInput = det.fRawInput; - fRawLength = det.fRawLength; - } - fInputStream = det.fInputStream; - fCharsetName = csName; - fLang = lang; - } - - - // - // Private Data - // - private int fConfidence; - private byte[] fRawInput = null; // Original, untouched input bytes. - // If user gave us a byte array, this is it. - private int fRawLength; // Length of data in fRawInput array. - - private InputStream fInputStream = null; // User's input stream, or null if the user - // gave us a byte array. - - private String fCharsetName; // The name of the charset this CharsetMatch - // represents. Filled in by the recognizer. - private String fLang; // The language, if one was determined by - // the recognizer during the detect operation. -} diff --git a/src/main/java/com/ibm/icu/text/CharsetRecog_2022.java b/src/main/java/com/ibm/icu/text/CharsetRecog_2022.java deleted file mode 100644 index 3046b4d..0000000 --- a/src/main/java/com/ibm/icu/text/CharsetRecog_2022.java +++ /dev/null @@ -1,173 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* Copyright (C) 2005 - 2012, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ -package com.ibm.icu.text; - -/** - * class CharsetRecog_2022 part of the ICU charset detection implementation. - * This is a superclass for the individual detectors for - * each of the detectable members of the ISO 2022 family - * of encodings. - * - * The separate classes are nested within this class. - */ -abstract class CharsetRecog_2022 extends CharsetRecognizer { - - - /** - * Matching function shared among the 2022 detectors JP, CN and KR - * Counts up the number of legal an unrecognized escape sequences in - * the sample of text, and computes a score based on the total number & - * the proportion that fit the encoding. - * - * - * @param text the byte buffer containing text to analyse - * @param textLen the size of the text in the byte. - * @param escapeSequences the byte escape sequences to test for. - * @return match quality, in the range of 0-100. - */ - int match(byte [] text, int textLen, byte [][] escapeSequences) { - int i, j; - int escN; - int hits = 0; - int misses = 0; - int shifts = 0; - int quality; - scanInput: - for (i=0; i= 3 && - (input[0] & 0xFF) == 0xef && (input[1] & 0xFF) == 0xbb && (input[2] & 0xFF) == 0xbf) { - hasBOM = true; - } - - // Scan for multi-byte sequences - for (i=0; i=det.fRawLength) { - break; - } - b = input[i]; - if ((b & 0xc0) != 0x080) { - numInvalid++; - break; - } - if (--trailBytes == 0) { - numValid++; - break; - } - } - } - - // Cook up some sort of confidence score, based on presence of a BOM - // and the existence of valid and/or invalid multi-byte sequences. - confidence = 0; - if (hasBOM && numInvalid==0) { - confidence = 100; - } else if (hasBOM && numValid > numInvalid*10) { - confidence = 80; - } else if (numValid > 3 && numInvalid == 0) { - confidence = 100; - } else if (numValid > 0 && numInvalid == 0) { - confidence = 80; - } else if (numValid == 0 && numInvalid == 0) { - // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which - // accepts ASCII with confidence = 10. - // TODO: add plain ASCII as an explicitly detected type. - confidence = 15; - } else if (numValid > numInvalid*10) { - // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. - confidence = 25; - } - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - -} diff --git a/src/main/java/com/ibm/icu/text/CharsetRecog_Unicode.java b/src/main/java/com/ibm/icu/text/CharsetRecog_Unicode.java deleted file mode 100644 index ce9827d..0000000 --- a/src/main/java/com/ibm/icu/text/CharsetRecog_Unicode.java +++ /dev/null @@ -1,212 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* - ******************************************************************************* - * Copyright (C) 1996-2013, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* - * - */ - -package com.ibm.icu.text; - -/** - * This class matches UTF-16 and UTF-32, both big- and little-endian. The - * BOM will be used if it is present. - */ -abstract class CharsetRecog_Unicode extends CharsetRecognizer { - - /* (non-Javadoc) - * @see com.ibm.icu.text.CharsetRecognizer#getName() - */ - @Override - abstract String getName(); - - /* (non-Javadoc) - * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector) - */ - @Override - abstract CharsetMatch match(CharsetDetector det); - - static int codeUnit16FromBytes(byte hi, byte lo) { - return ((hi & 0xff) << 8) | (lo & 0xff); - } - - // UTF-16 confidence calculation. Very simple minded, but better than nothing. - // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, - // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. - // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. - // NULs should be rare in actual text. - static int adjustConfidence(int codeUnit, int confidence) { - if (codeUnit == 0) { - confidence -= 10; - } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { - confidence += 10; - } - if (confidence < 0) { - confidence = 0; - } else if (confidence > 100) { - confidence = 100; - } - return confidence; - } - - static class CharsetRecog_UTF_16_BE extends CharsetRecog_Unicode - { - @Override - String getName() - { - return "UTF-16BE"; - } - - @Override - CharsetMatch match(CharsetDetector det) - { - byte[] input = det.fRawInput; - int confidence = 10; - - int bytesToCheck = Math.min(input.length, 30); - for (int charIndex=0; charIndex 0) { - return new CharsetMatch(det, this, confidence); - } - return null; - } - } - - static class CharsetRecog_UTF_16_LE extends CharsetRecog_Unicode - { - @Override - String getName() - { - return "UTF-16LE"; - } - - @Override - CharsetMatch match(CharsetDetector det) - { - byte[] input = det.fRawInput; - int confidence = 10; - - int bytesToCheck = Math.min(input.length, 30); - for (int charIndex=0; charIndex 0) { - return new CharsetMatch(det, this, confidence); - } - return null; - } - } - - static abstract class CharsetRecog_UTF_32 extends CharsetRecog_Unicode - { - abstract int getChar(byte[] input, int index); - - @Override - abstract String getName(); - - @Override - CharsetMatch match(CharsetDetector det) - { - byte[] input = det.fRawInput; - int limit = (det.fRawLength / 4) * 4; - int numValid = 0; - int numInvalid = 0; - boolean hasBOM = false; - int confidence = 0; - - if (limit==0) { - return null; - } - if (getChar(input, 0) == 0x0000FEFF) { - hasBOM = true; - } - - for(int i = 0; i < limit; i += 4) { - int ch = getChar(input, i); - - if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { - numInvalid += 1; - } else { - numValid += 1; - } - } - - - // Cook up some sort of confidence score, based on presence of a BOM - // and the existence of valid and/or invalid multi-byte sequences. - if (hasBOM && numInvalid==0) { - confidence = 100; - } else if (hasBOM && numValid > numInvalid*10) { - confidence = 80; - } else if (numValid > 3 && numInvalid == 0) { - confidence = 100; - } else if (numValid > 0 && numInvalid == 0) { - confidence = 80; - } else if (numValid > numInvalid*10) { - // Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance. - confidence = 25; - } - - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - - static class CharsetRecog_UTF_32_BE extends CharsetRecog_UTF_32 - { - @Override - int getChar(byte[] input, int index) - { - return (input[index + 0] & 0xFF) << 24 | (input[index + 1] & 0xFF) << 16 | - (input[index + 2] & 0xFF) << 8 | (input[index + 3] & 0xFF); - } - - @Override - String getName() - { - return "UTF-32BE"; - } - } - - - static class CharsetRecog_UTF_32_LE extends CharsetRecog_UTF_32 - { - @Override - int getChar(byte[] input, int index) - { - return (input[index + 3] & 0xFF) << 24 | (input[index + 2] & 0xFF) << 16 | - (input[index + 1] & 0xFF) << 8 | (input[index + 0] & 0xFF); - } - - @Override - String getName() - { - return "UTF-32LE"; - } - } -} diff --git a/src/main/java/com/ibm/icu/text/CharsetRecog_mbcs.java b/src/main/java/com/ibm/icu/text/CharsetRecog_mbcs.java deleted file mode 100644 index b4f0bc1..0000000 --- a/src/main/java/com/ibm/icu/text/CharsetRecog_mbcs.java +++ /dev/null @@ -1,562 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* - **************************************************************************** - * Copyright (C) 2005-2012, International Business Machines Corporation and * - * others. All Rights Reserved. * - **************************************************************************** - * - */ -package com.ibm.icu.text; - -import java.util.Arrays; - -/** - * CharsetRecognizer implementation for Asian - double or multi-byte - charsets. - * Match is determined mostly by the input data adhering to the - * encoding scheme for the charset, and, optionally, - * frequency-of-occurrence of characters. - *

- * Instances of this class are singletons, one per encoding - * being recognized. They are created in the main - * CharsetDetector class and kept in the global list of available - * encodings to be checked. The specific encoding being recognized - * is determined by subclass. - */ -abstract class CharsetRecog_mbcs extends CharsetRecognizer { - - /** - * Get the IANA name of this charset. - * @return the charset name. - */ - @Override - abstract String getName() ; - - - /** - * Test the match of this charset with the input text data - * which is obtained via the CharsetDetector object. - * - * @param det The CharsetDetector, which contains the input text - * to be checked for being in this charset. - * @return Two values packed into one int (Damn java, anyhow) - *
- * bits 0-7: the match confidence, ranging from 0-100 - *
- * bits 8-15: The match reason, an enum-like value. - */ - int match(CharsetDetector det, int [] commonChars) { - @SuppressWarnings("unused") - int singleByteCharCount = 0; //TODO Do we really need this? - int doubleByteCharCount = 0; - int commonCharCount = 0; - int badCharCount = 0; - int totalCharCount = 0; - int confidence = 0; - iteratedChar iter = new iteratedChar(); - - detectBlock: { - for (iter.reset(); nextChar(iter, det);) { - totalCharCount++; - if (iter.error) { - badCharCount++; - } else { - long cv = iter.charValue & 0xFFFFFFFFL; - - if (cv <= 0xff) { - singleByteCharCount++; - } else { - doubleByteCharCount++; - if (commonChars != null) { - // NOTE: This assumes that there are no 4-byte common chars. - if (Arrays.binarySearch(commonChars, (int) cv) >= 0) { - commonCharCount++; - } - } - } - } - if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { - // Bail out early if the byte data is not matching the encoding scheme. - break detectBlock; - } - } - - if (doubleByteCharCount <= 10 && badCharCount== 0) { - // Not many multi-byte chars. - if (doubleByteCharCount == 0 && totalCharCount < 10) { - // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. - // We don't have enough data to have any confidence. - // Statistical analysis of single byte non-ASCII characters would probably help here. - confidence = 0; - } - else { - // ASCII or ISO file? It's probably not our encoding, - // but is not incompatible with our encoding, so don't give it a zero. - confidence = 10; - } - - break detectBlock; - } - - // - // No match if there are too many characters that don't fit the encoding scheme. - // (should we have zero tolerance for these?) - // - if (doubleByteCharCount < 20*badCharCount) { - confidence = 0; - break detectBlock; - } - - if (commonChars == null) { - // We have no statistics on frequently occurring characters. - // Assess confidence purely on having a reasonable number of - // multi-byte characters (the more the better - confidence = 30 + doubleByteCharCount - 20*badCharCount; - if (confidence > 100) { - confidence = 100; - } - }else { - // - // Frequency of occurrence statistics exist. - // - double maxVal = Math.log((float)doubleByteCharCount / 4); - double scaleFactor = 90.0 / maxVal; - confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10); - confidence = Math.min(confidence, 100); - } - } // end of detectBlock: - - return confidence; - } - - // "Character" iterated character class. - // Recognizers for specific mbcs encodings make their "characters" available - // by providing a nextChar() function that fills in an instance of iteratedChar - // with the next char from the input. - // The returned characters are not converted to Unicode, but remain as the raw - // bytes (concatenated into an int) from the codepage data. - // - // For Asian charsets, use the raw input rather than the input that has been - // stripped of markup. Detection only considers multi-byte chars, effectively - // stripping markup anyway, and double byte chars do occur in markup too. - // - static class iteratedChar { - int charValue = 0; // 1-4 bytes from the raw input data - int nextIndex = 0; - boolean error = false; - boolean done = false; - - void reset() { - charValue = 0; - nextIndex = 0; - error = false; - done = false; - } - - int nextByte(CharsetDetector det) { - if (nextIndex >= det.fRawLength) { - done = true; - return -1; - } - int byteValue = det.fRawInput[nextIndex++] & 0x00ff; - return byteValue; - } - } - - /** - * Get the next character (however many bytes it is) from the input data - * Subclasses for specific charset encodings must implement this function - * to get characters according to the rules of their encoding scheme. - * - * This function is not a method of class iteratedChar only because - * that would require a lot of extra derived classes, which is awkward. - * @param it The iteratedChar "struct" into which the returned char is placed. - * @param det The charset detector, which is needed to get at the input byte data - * being iterated over. - * @return True if a character was returned, false at end of input. - */ - abstract boolean nextChar(iteratedChar it, CharsetDetector det); - - - - - - /** - * Shift-JIS charset recognizer. - * - */ - static class CharsetRecog_sjis extends CharsetRecog_mbcs { - static int [] commonChars = - // TODO: This set of data comes from the character frequency- - // of-occurrence analysis tool. The data needs to be moved - // into a resource and loaded from there. - {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, - 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, - 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, - 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, - 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, - 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; - - @Override - boolean nextChar(iteratedChar it, CharsetDetector det) { - it.error = false; - int firstByte; - firstByte = it.charValue = it.nextByte(det); - if (firstByte < 0) { - return false; - } - - if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) { - return true; - } - - int secondByte = it.nextByte(det); - if (secondByte < 0) { - return false; - } - it.charValue = (firstByte << 8) | secondByte; - if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) { - // Illegal second byte value. - it.error = true; - } - return true; - } - - @Override - CharsetMatch match(CharsetDetector det) { - int confidence = match(det, commonChars); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - - @Override - String getName() { - return "Shift_JIS"; - } - - @Override - public String getLanguage() - { - return "ja"; - } - - - } - - - /** - * Big5 charset recognizer. - * - */ - static class CharsetRecog_big5 extends CharsetRecog_mbcs { - static int [] commonChars = - // TODO: This set of data comes from the character frequency- - // of-occurrence analysis tool. The data needs to be moved - // into a resource and loaded from there. - {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, - 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, - 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, - 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, - 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, - 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, - 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, - 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, - 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, - 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; - - @Override - boolean nextChar(iteratedChar it, CharsetDetector det) { - it.error = false; - int firstByte; - firstByte = it.charValue = it.nextByte(det); - if (firstByte < 0) { - return false; - } - - if (firstByte <= 0x7f || firstByte==0xff) { - // single byte character. - return true; - } - - int secondByte = it.nextByte(det); - if (secondByte < 0) { - return false; - } - it.charValue = (it.charValue << 8) | secondByte; - - if (secondByte < 0x40 || - secondByte ==0x7f || - secondByte == 0xff) { - it.error = true; - } - return true; - } - - @Override - CharsetMatch match(CharsetDetector det) { - int confidence = match(det, commonChars); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - - @Override - String getName() { - return "Big5"; - } - - - @Override - public String getLanguage() - { - return "zh"; - } - } - - - /** - * EUC charset recognizers. One abstract class that provides the common function - * for getting the next character according to the EUC encoding scheme, - * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. - * - */ - abstract static class CharsetRecog_euc extends CharsetRecog_mbcs { - - /* - * (non-Javadoc) - * Get the next character value for EUC based encodings. - * Character "value" is simply the raw bytes that make up the character - * packed into an int. - */ - @Override - boolean nextChar(iteratedChar it, CharsetDetector det) { - it.error = false; - int firstByte = 0; - int secondByte = 0; - int thirdByte = 0; - //int fourthByte = 0; - - buildChar: { - firstByte = it.charValue = it.nextByte(det); - if (firstByte < 0) { - // Ran off the end of the input data - it.done = true; - break buildChar; - } - if (firstByte <= 0x8d) { - // single byte char - break buildChar; - } - - secondByte = it.nextByte(det); - it.charValue = (it.charValue << 8) | secondByte; - - if (firstByte >= 0xA1 && firstByte <= 0xfe) { - // Two byte Char - if (secondByte < 0xa1) { - it.error = true; - } - break buildChar; - } - if (firstByte == 0x8e) { - // Code Set 2. - // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. - // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. - // We don't know which we've got. - // Treat it like EUC-JP. If the data really was EUC-TW, the following two - // bytes will look like a well formed 2 byte char. - if (secondByte < 0xa1) { - it.error = true; - } - break buildChar; - } - - if (firstByte == 0x8f) { - // Code set 3. - // Three byte total char size, two bytes of actual char value. - thirdByte = it.nextByte(det); - it.charValue = (it.charValue << 8) | thirdByte; - if (thirdByte < 0xa1) { - it.error = true; - } - } - } - - return (it.done == false); - } - - /** - * The charset recognize for EUC-JP. A singleton instance of this class - * is created and kept by the public CharsetDetector class - */ - static class CharsetRecog_euc_jp extends CharsetRecog_euc { - static int [] commonChars = - // TODO: This set of data comes from the character frequency- - // of-occurrence analysis tool. The data needs to be moved - // into a resource and loaded from there. - {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, - 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, - 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, - 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, - 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, - 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, - 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, - 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, - 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, - 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; - @Override - String getName() { - return "EUC-JP"; - } - - @Override - CharsetMatch match(CharsetDetector det) { - int confidence = match(det, commonChars); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - - @Override - public String getLanguage() - { - return "ja"; - } - } - - /** - * The charset recognize for EUC-KR. A singleton instance of this class - * is created and kept by the public CharsetDetector class - */ - static class CharsetRecog_euc_kr extends CharsetRecog_euc { - static int [] commonChars = - // TODO: This set of data comes from the character frequency- - // of-occurrence analysis tool. The data needs to be moved - // into a resource and loaded from there. - {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, - 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, - 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, - 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, - 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, - 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, - 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, - 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, - 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, - 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; - - @Override - String getName() { - return "EUC-KR"; - } - - @Override - CharsetMatch match(CharsetDetector det) { - int confidence = match(det, commonChars); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - - @Override - public String getLanguage() - { - return "ko"; - } - } - } - - /** - * - * GB-18030 recognizer. Uses simplified Chinese statistics. - * - */ - static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs { - - /* - * (non-Javadoc) - * Get the next character value for EUC based encodings. - * Character "value" is simply the raw bytes that make up the character - * packed into an int. - */ - @Override - boolean nextChar(iteratedChar it, CharsetDetector det) { - it.error = false; - int firstByte = 0; - int secondByte = 0; - int thirdByte = 0; - int fourthByte = 0; - - buildChar: { - firstByte = it.charValue = it.nextByte(det); - - if (firstByte < 0) { - // Ran off the end of the input data - it.done = true; - break buildChar; - } - - if (firstByte <= 0x80) { - // single byte char - break buildChar; - } - - secondByte = it.nextByte(det); - it.charValue = (it.charValue << 8) | secondByte; - - if (firstByte >= 0x81 && firstByte <= 0xFE) { - // Two byte Char - if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) { - break buildChar; - } - - // Four byte char - if (secondByte >= 0x30 && secondByte <= 0x39) { - thirdByte = it.nextByte(det); - - if (thirdByte >= 0x81 && thirdByte <= 0xFE) { - fourthByte = it.nextByte(det); - - if (fourthByte >= 0x30 && fourthByte <= 0x39) { - it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte; - break buildChar; - } - } - } - - it.error = true; - break buildChar; - } - } - - return (it.done == false); - } - - static int [] commonChars = - // TODO: This set of data comes from the character frequency- - // of-occurrence analysis tool. The data needs to be moved - // into a resource and loaded from there. - {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, - 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, - 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, - 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, - 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, - 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, - 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, - 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, - 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, - 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; - - - @Override - String getName() { - return "GB18030"; - } - - @Override - CharsetMatch match(CharsetDetector det) { - int confidence = match(det, commonChars); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - - @Override - public String getLanguage() - { - return "zh"; - } - } - - -} diff --git a/src/main/java/com/ibm/icu/text/CharsetRecog_sbcs.java b/src/main/java/com/ibm/icu/text/CharsetRecog_sbcs.java deleted file mode 100644 index f352d5e..0000000 --- a/src/main/java/com/ibm/icu/text/CharsetRecog_sbcs.java +++ /dev/null @@ -1,1253 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* - **************************************************************************** - * Copyright (C) 2005-2013, International Business Machines Corporation and * - * others. All Rights Reserved. * - ************************************************************************** * - * - */ - -package com.ibm.icu.text; - -/** - * This class recognizes single-byte encodings. Because the encoding scheme is so - * simple, language statistics are used to do the matching. - */ -abstract class CharsetRecog_sbcs extends CharsetRecognizer { - - /* (non-Javadoc) - * @see com.ibm.icu.text.CharsetRecognizer#getName() - */ - @Override - abstract String getName(); - - static class NGramParser - { -// private static final int N_GRAM_SIZE = 3; - private static final int N_GRAM_MASK = 0xFFFFFF; - - protected int byteIndex = 0; - private int ngram = 0; - - private int[] ngramList; - protected byte[] byteMap; - - private int ngramCount; - private int hitCount; - - protected byte spaceChar; - - public NGramParser(int[] theNgramList, byte[] theByteMap) - { - ngramList = theNgramList; - byteMap = theByteMap; - - ngram = 0; - - ngramCount = hitCount = 0; - } - - /* - * Binary search for value in table, which must have exactly 64 entries. - */ - private static int search(int[] table, int value) - { - int index = 0; - - if (table[index + 32] <= value) { - index += 32; - } - - if (table[index + 16] <= value) { - index += 16; - } - - if (table[index + 8] <= value) { - index += 8; - } - - if (table[index + 4] <= value) { - index += 4; - } - - if (table[index + 2] <= value) { - index += 2; - } - - if (table[index + 1] <= value) { - index += 1; - } - - if (table[index] > value) { - index -= 1; - } - - if (index < 0 || table[index] != value) { - return -1; - } - - return index; - } - - private void lookup(int thisNgram) - { - ngramCount += 1; - - if (search(ngramList, thisNgram) >= 0) { - hitCount += 1; - } - - } - - protected void addByte(int b) - { - ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK; - lookup(ngram); - } - - private int nextByte(CharsetDetector det) - { - if (byteIndex >= det.fInputLen) { - return -1; - } - - return det.fInputBytes[byteIndex++] & 0xFF; - } - - protected void parseCharacters(CharsetDetector det) - { - int b; - boolean ignoreSpace = false; - - while ((b = nextByte(det)) >= 0) { - byte mb = byteMap[b]; - - // TODO: 0x20 might not be a space in all character sets... - if (mb != 0) { - if (!(mb == spaceChar && ignoreSpace)) { - addByte(mb); - } - - ignoreSpace = (mb == spaceChar); - } - } - - } - - public int parse(CharsetDetector det) - { - return parse (det, (byte)0x20); - } - public int parse(CharsetDetector det, byte spaceCh) - { - - this.spaceChar = spaceCh; - - parseCharacters(det); - - // TODO: Is this OK? The buffer could have ended in the middle of a word... - addByte(spaceChar); - - double rawPercent = (double) hitCount / (double) ngramCount; - -// if (rawPercent <= 2.0) { -// return 0; -// } - - // TODO - This is a bit of a hack to take care of a case - // were we were getting a confidence of 135... - if (rawPercent > 0.33) { - return 98; - } - - return (int) (rawPercent * 300.0); - } - } - - static class NGramParser_IBM420 extends NGramParser - { - private byte alef = 0x00; - - protected static byte[] unshapeMap = { -/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ -/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x42, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x47, (byte) 0x49, (byte) 0x4A, (byte) 0x4B, (byte) 0x4C, (byte) 0x4D, (byte) 0x4E, (byte) 0x4F, -/* 5- */ (byte) 0x50, (byte) 0x49, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x56, (byte) 0x58, (byte) 0x58, (byte) 0x5A, (byte) 0x5B, (byte) 0x5C, (byte) 0x5D, (byte) 0x5E, (byte) 0x5F, -/* 6- */ (byte) 0x60, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x63, (byte) 0x65, (byte) 0x65, (byte) 0x67, (byte) 0x67, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, -/* 7- */ (byte) 0x69, (byte) 0x71, (byte) 0x71, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x77, (byte) 0x79, (byte) 0x7A, (byte) 0x7B, (byte) 0x7C, (byte) 0x7D, (byte) 0x7E, (byte) 0x7F, -/* 8- */ (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x80, (byte) 0x8B, (byte) 0x8B, (byte) 0x8D, (byte) 0x8D, (byte) 0x8F, -/* 9- */ (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9A, (byte) 0x9E, (byte) 0x9E, -/* A- */ (byte) 0x9E, (byte) 0xA1, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x9E, (byte) 0xAB, (byte) 0xAB, (byte) 0xAD, (byte) 0xAD, (byte) 0xAF, -/* B- */ (byte) 0xAF, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, (byte) 0xB8, (byte) 0xB9, (byte) 0xB1, (byte) 0xBB, (byte) 0xBB, (byte) 0xBD, (byte) 0xBD, (byte) 0xBF, -/* C- */ (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xBF, (byte) 0xCC, (byte) 0xBF, (byte) 0xCE, (byte) 0xCF, -/* D- */ (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDA, (byte) 0xDC, (byte) 0xDC, (byte) 0xDC, (byte) 0xDF, -/* E- */ (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, -/* F- */ (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF, - }; - - - public NGramParser_IBM420(int[] theNgramList, byte[] theByteMap) - { - super(theNgramList, theByteMap); - } - - private byte isLamAlef(byte b) { - if(b == (byte)0xb2 || b == (byte)0xb3){ - return (byte)0x47; - }else if(b == (byte)0xb4 || b == (byte)0xb5){ - return (byte)0x49; - }else if(b == (byte)0xb8 || b == (byte)0xb9){ - return (byte)0x56; - }else - return (byte)0x00; - } - - /* - * Arabic shaping needs to be done manually. Cannot call ArabicShaping class - * because CharsetDetector is dealing with bytes not Unicode code points. We could - * convert the bytes to Unicode code points but that would leave us dependent - * on CharsetICU which we try to avoid. IBM420 converter amongst different versions - * of JDK can produce different results and therefore is also avoided. - */ - private int nextByte(CharsetDetector det) - { - if (byteIndex >= det.fInputLen || det.fInputBytes[byteIndex] == 0) { - return -1; - } - int next; - - alef = isLamAlef(det.fInputBytes[byteIndex]); - if(alef != (byte)0x00) - next = 0xB1 & 0xFF; - else - next = unshapeMap[det.fInputBytes[byteIndex]& 0xFF] & 0xFF; - - byteIndex++; - - return next; - } - - @Override - protected void parseCharacters(CharsetDetector det) - { - int b; - boolean ignoreSpace = false; - - while ((b = nextByte(det)) >= 0) { - byte mb = byteMap[b]; - - // TODO: 0x20 might not be a space in all character sets... - if (mb != 0) { - if (!(mb == spaceChar && ignoreSpace)) { - addByte(mb); - } - - ignoreSpace = (mb == spaceChar); - } - if(alef != (byte)0x00){ - mb = byteMap[alef & 0xFF]; - - // TODO: 0x20 might not be a space in all character sets... - if (mb != 0) { - if (!(mb == spaceChar && ignoreSpace)) { - addByte(mb); - } - - ignoreSpace = (mb == spaceChar); - } - - } - } - } - } - - - int match(CharsetDetector det, int[] ngrams, byte[] byteMap) - { - return match (det, ngrams, byteMap, (byte)0x20); - } - - int match(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar) - { - NGramParser parser = new NGramParser(ngrams, byteMap); - return parser.parse(det, spaceChar); - } - - int matchIBM420(CharsetDetector det, int[] ngrams, byte[] byteMap, byte spaceChar){ - NGramParser_IBM420 parser = new NGramParser_IBM420(ngrams, byteMap); - return parser.parse(det, spaceChar); - } - - static class NGramsPlusLang { - int[] fNGrams; - String fLang; - NGramsPlusLang(String la, int [] ng) { - fLang = la; - fNGrams = ng; - } - } - - static class CharsetRecog_8859_1 extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF, - }; - - - private static NGramsPlusLang[] ngrams_8859_1 = new NGramsPlusLang[] { - new NGramsPlusLang( - "da", - new int[] { - 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620, - 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320, - 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520, - 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572, - }), - new NGramsPlusLang( - "de", - new int[] { - 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F, - 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220, - 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465, - 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572, - }), - new NGramsPlusLang( - "en", - new int[] { - 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F, - 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74, - 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420, - 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320, - }), - - new NGramsPlusLang( - "es", - new int[] { - 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, - 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C, - 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064, - 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20, - }), - - new NGramsPlusLang( - "fr", - new int[] { - 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E, - 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20, - 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420, - 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220, - }), - - new NGramsPlusLang( - "it", - new int[] { - 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073, - 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220, - 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20, - 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F, - }), - - new NGramsPlusLang( - "nl", - new int[] { - 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665, - 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E, - 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F, - 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F, - }), - - new NGramsPlusLang( - "no", - new int[] { - 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469, - 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474, - 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65, - 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572, - }), - - new NGramsPlusLang( - "pt", - new int[] { - 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365, - 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20, - 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065, - 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F, - - }), - - new NGramsPlusLang( - "sv", - new int[] { - 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469, - 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220, - 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20, - 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220, - }), - - }; - - - @Override - public CharsetMatch match(CharsetDetector det) - { - String name = det.fC1Bytes ? "windows-1252" : "ISO-8859-1"; - int bestConfidenceSoFar = -1; - String lang = null; - for (NGramsPlusLang ngl: ngrams_8859_1) { - int confidence = match(det, ngl.fNGrams, byteMap); - if (confidence > bestConfidenceSoFar) { - bestConfidenceSoFar = confidence; - lang = ngl.fLang; - } - } - return bestConfidenceSoFar <= 0 ? null : new CharsetMatch(det, this, bestConfidenceSoFar, name, lang); - } - - - @Override - public String getName() - { - return "ISO-8859-1"; - } - } - - - static class CharsetRecog_8859_2 extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0x20, - (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF, - (byte) 0x20, (byte) 0xB1, (byte) 0x20, (byte) 0xB3, (byte) 0x20, (byte) 0xB5, (byte) 0xB6, (byte) 0xB7, - (byte) 0x20, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0x20, (byte) 0xBE, (byte) 0xBF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xDF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20, - }; - - private static NGramsPlusLang[] ngrams_8859_2 = new NGramsPlusLang[] { - new NGramsPlusLang( - "cs", - new int[] { - 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F, - 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465, - 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865, - 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564, - }), - new NGramsPlusLang( - "hu", - new int[] { - 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69, - 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20, - 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061, - 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320, - }), - new NGramsPlusLang( - "pl", - new int[] { - 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779, - 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20, - 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769, - 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720, - }), - new NGramsPlusLang( - "ro", - new int[] { - 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69, - 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070, - 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72, - 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20, - }) - }; - - @Override - public CharsetMatch match(CharsetDetector det) - { - String name = det.fC1Bytes ? "windows-1250" : "ISO-8859-2"; - int bestConfidenceSoFar = -1; - String lang = null; - for (NGramsPlusLang ngl: ngrams_8859_2) { - int confidence = match(det, ngl.fNGrams, byteMap); - if (confidence > bestConfidenceSoFar) { - bestConfidenceSoFar = confidence; - lang = ngl.fLang; - } - } - return bestConfidenceSoFar <= 0 ? null : new CharsetMatch(det, this, bestConfidenceSoFar, name, lang); - } - - @Override - public String getName() - { - return "ISO-8859-2"; - } - - } - - - abstract static class CharsetRecog_8859_5 extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF, - (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, - (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, - (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0x20, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0xFE, (byte) 0xFF, - }; - - @Override - public String getName() - { - return "ISO-8859-5"; - } - } - - static class CharsetRecog_8859_5_ru extends CharsetRecog_8859_5 - { - private static int[] ngrams = { - 0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE, - 0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD, - 0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2, - 0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520, - }; - - @Override - public String getLanguage() - { - return "ru"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - - abstract static class CharsetRecog_8859_6 extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, - (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF, - (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, - (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - }; - - @Override - public String getName() - { - return "ISO-8859-6"; - } - } - - static class CharsetRecog_8859_6_ar extends CharsetRecog_8859_6 - { - private static int[] ngrams = { - 0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8, - 0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1, - 0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20, - 0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620, - }; - - @Override - public String getLanguage() - { - return "ar"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - - abstract static class CharsetRecog_8859_7 extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0xA1, (byte) 0xA2, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xDC, (byte) 0x20, - (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, (byte) 0x20, (byte) 0xFC, (byte) 0x20, (byte) 0xFD, (byte) 0xFE, - (byte) 0xC0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0x20, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x20, - }; - - @Override - public String getName() - { - return "ISO-8859-7"; - } - } - - static class CharsetRecog_8859_7_el extends CharsetRecog_8859_7 - { - private static int[] ngrams = { - 0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7, - 0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120, - 0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5, - 0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20, - }; - - @Override - public String getLanguage() - { - return "el"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - String name = det.fC1Bytes ? "windows-1253" : "ISO-8859-7"; - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "el"); - } - } - - abstract static class CharsetRecog_8859_8 extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - }; - - @Override - public String getName() - { - return "ISO-8859-8"; - } - } - - static class CharsetRecog_8859_8_I_he extends CharsetRecog_8859_8 - { - private static int[] ngrams = { - 0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0, - 0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4, - 0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE, - 0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9, - }; - - @Override - public String getName() - { - return "ISO-8859-8-I"; - } - - @Override - public String getLanguage() - { - return "he"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8-I"; - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he"); - } - } - - static class CharsetRecog_8859_8_he extends CharsetRecog_8859_8 - { - private static int[] ngrams = { - 0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0, - 0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC, - 0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920, - 0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9, - }; - - @Override - public String getLanguage() - { - return "he"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8"; - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he"); - - } - } - - abstract static class CharsetRecog_8859_9 extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0x69, (byte) 0xFE, (byte) 0xDF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0x20, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF, - }; - - @Override - public String getName() - { - return "ISO-8859-9"; - } - } - - static class CharsetRecog_8859_9_tr extends CharsetRecog_8859_9 - { - private static int[] ngrams = { - 0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961, - 0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062, - 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062, - 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD, - }; - - @Override - public String getLanguage() - { - return "tr"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - String name = det.fC1Bytes ? "windows-1254" : "ISO-8859-9"; - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "tr"); - } - } - - static class CharsetRecog_windows_1251 extends CharsetRecog_sbcs - { - private static int[] ngrams = { - 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE, - 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED, - 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2, - 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520, - }; - - private static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x90, (byte) 0x83, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F, - (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F, - (byte) 0x20, (byte) 0xA2, (byte) 0xA2, (byte) 0xBC, (byte) 0x20, (byte) 0xB4, (byte) 0x20, (byte) 0x20, - (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xBF, - (byte) 0x20, (byte) 0x20, (byte) 0xB3, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x20, (byte) 0x20, - (byte) 0xB8, (byte) 0x20, (byte) 0xBA, (byte) 0x20, (byte) 0xBC, (byte) 0xBE, (byte) 0xBE, (byte) 0xBF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0xF0, (byte) 0xF1, (byte) 0xF2, (byte) 0xF3, (byte) 0xF4, (byte) 0xF5, (byte) 0xF6, (byte) 0xF7, - (byte) 0xF8, (byte) 0xF9, (byte) 0xFA, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0xFF, - }; - - @Override - public String getName() - { - return "windows-1251"; - } - - @Override - public String getLanguage() - { - return "ru"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - - static class CharsetRecog_windows_1256 extends CharsetRecog_sbcs - { - private static int[] ngrams = { - 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8, - 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD, - 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20, - 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420, - }; - - private static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x81, (byte) 0x20, (byte) 0x83, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x88, (byte) 0x20, (byte) 0x8A, (byte) 0x20, (byte) 0x9C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F, - (byte) 0x90, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x98, (byte) 0x20, (byte) 0x9A, (byte) 0x20, (byte) 0x9C, (byte) 0x20, (byte) 0x20, (byte) 0x9F, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0xAA, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xB5, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, - (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF, - (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0x20, - (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, - (byte) 0xE0, (byte) 0xE1, (byte) 0xE2, (byte) 0xE3, (byte) 0xE4, (byte) 0xE5, (byte) 0xE6, (byte) 0xE7, - (byte) 0xE8, (byte) 0xE9, (byte) 0xEA, (byte) 0xEB, (byte) 0xEC, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xF4, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0xF9, (byte) 0x20, (byte) 0xFB, (byte) 0xFC, (byte) 0x20, (byte) 0x20, (byte) 0xFF, - }; - - @Override - public String getName() - { - return "windows-1256"; - } - - @Override - public String getLanguage() - { - return "ar"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - - static class CharsetRecog_KOI8_R extends CharsetRecog_sbcs - { - private static int[] ngrams = { - 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1, - 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE, - 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1, - 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF, - }; - - private static byte[] byteMap = { - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x00, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x61, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, - (byte) 0x68, (byte) 0x69, (byte) 0x6A, (byte) 0x6B, (byte) 0x6C, (byte) 0x6D, (byte) 0x6E, (byte) 0x6F, - (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, - (byte) 0x78, (byte) 0x79, (byte) 0x7A, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0xA3, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, (byte) 0x20, - (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, - (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF, - (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, - (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, - (byte) 0xC0, (byte) 0xC1, (byte) 0xC2, (byte) 0xC3, (byte) 0xC4, (byte) 0xC5, (byte) 0xC6, (byte) 0xC7, - (byte) 0xC8, (byte) 0xC9, (byte) 0xCA, (byte) 0xCB, (byte) 0xCC, (byte) 0xCD, (byte) 0xCE, (byte) 0xCF, - (byte) 0xD0, (byte) 0xD1, (byte) 0xD2, (byte) 0xD3, (byte) 0xD4, (byte) 0xD5, (byte) 0xD6, (byte) 0xD7, - (byte) 0xD8, (byte) 0xD9, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, - }; - - @Override - public String getName() - { - return "KOI8-R"; - } - - @Override - public String getLanguage() - { - return "ru"; - } - - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = match(det, ngrams, byteMap); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - - abstract static class CharsetRecog_IBM424_he extends CharsetRecog_sbcs - { - protected static byte[] byteMap = { -/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ -/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 4- */ (byte) 0x40, (byte) 0x41, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x53, (byte) 0x54, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 7- */ (byte) 0x40, (byte) 0x71, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x00, (byte) 0x40, (byte) 0x40, -/* 8- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 9- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* B- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, - }; - - @Override - public String getLanguage() - { - return "he"; - } - } - static class CharsetRecog_IBM424_he_rtl extends CharsetRecog_IBM424_he - { - @Override - public String getName() - { - return "IBM424_rtl"; - } - private static int[] ngrams = { - 0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641, - 0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045, - 0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056, - 0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069, - }; - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = match(det, ngrams, byteMap, (byte)0x40); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - static class CharsetRecog_IBM424_he_ltr extends CharsetRecog_IBM424_he - { - @Override - public String getName() - { - return "IBM424_ltr"; - } - private static int[] ngrams = { - 0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141, - 0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054, - 0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940, - 0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651 - - }; - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = match(det, ngrams, byteMap, (byte)0x40); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - } - - abstract static class CharsetRecog_IBM420_ar extends CharsetRecog_sbcs - { - - protected static byte[] byteMap = { -/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */ -/* 0- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 1- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 2- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 3- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 4- */ (byte) 0x40, (byte) 0x40, (byte) 0x42, (byte) 0x43, (byte) 0x44, (byte) 0x45, (byte) 0x46, (byte) 0x47, (byte) 0x48, (byte) 0x49, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 5- */ (byte) 0x40, (byte) 0x51, (byte) 0x52, (byte) 0x40, (byte) 0x40, (byte) 0x55, (byte) 0x56, (byte) 0x57, (byte) 0x58, (byte) 0x59, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 6- */ (byte) 0x40, (byte) 0x40, (byte) 0x62, (byte) 0x63, (byte) 0x64, (byte) 0x65, (byte) 0x66, (byte) 0x67, (byte) 0x68, (byte) 0x69, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 7- */ (byte) 0x70, (byte) 0x71, (byte) 0x72, (byte) 0x73, (byte) 0x74, (byte) 0x75, (byte) 0x76, (byte) 0x77, (byte) 0x78, (byte) 0x79, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, -/* 8- */ (byte) 0x80, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x8A, (byte) 0x8B, (byte) 0x8C, (byte) 0x8D, (byte) 0x8E, (byte) 0x8F, -/* 9- */ (byte) 0x90, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0x9A, (byte) 0x9B, (byte) 0x9C, (byte) 0x9D, (byte) 0x9E, (byte) 0x9F, -/* A- */ (byte) 0xA0, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xAA, (byte) 0xAB, (byte) 0xAC, (byte) 0xAD, (byte) 0xAE, (byte) 0xAF, -/* B- */ (byte) 0xB0, (byte) 0xB1, (byte) 0xB2, (byte) 0xB3, (byte) 0xB4, (byte) 0xB5, (byte) 0x40, (byte) 0x40, (byte) 0xB8, (byte) 0xB9, (byte) 0xBA, (byte) 0xBB, (byte) 0xBC, (byte) 0xBD, (byte) 0xBE, (byte) 0xBF, -/* C- */ (byte) 0x40, (byte) 0x81, (byte) 0x82, (byte) 0x83, (byte) 0x84, (byte) 0x85, (byte) 0x86, (byte) 0x87, (byte) 0x88, (byte) 0x89, (byte) 0x40, (byte) 0xCB, (byte) 0x40, (byte) 0xCD, (byte) 0x40, (byte) 0xCF, -/* D- */ (byte) 0x40, (byte) 0x91, (byte) 0x92, (byte) 0x93, (byte) 0x94, (byte) 0x95, (byte) 0x96, (byte) 0x97, (byte) 0x98, (byte) 0x99, (byte) 0xDA, (byte) 0xDB, (byte) 0xDC, (byte) 0xDD, (byte) 0xDE, (byte) 0xDF, -/* E- */ (byte) 0x40, (byte) 0x40, (byte) 0xA2, (byte) 0xA3, (byte) 0xA4, (byte) 0xA5, (byte) 0xA6, (byte) 0xA7, (byte) 0xA8, (byte) 0xA9, (byte) 0xEA, (byte) 0xEB, (byte) 0x40, (byte) 0xED, (byte) 0xEE, (byte) 0xEF, -/* F- */ (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0x40, (byte) 0xFB, (byte) 0xFC, (byte) 0xFD, (byte) 0xFE, (byte) 0x40, - }; - - - @Override - public String getLanguage() - { - return "ar"; - } - - } - static class CharsetRecog_IBM420_ar_rtl extends CharsetRecog_IBM420_ar - { - private static int[] ngrams = { - 0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158, - 0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB, - 0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40, - 0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40, - }; - - @Override - public String getName() - { - return "IBM420_rtl"; - } - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = matchIBM420(det, ngrams, byteMap, (byte)0x40); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - - } - static class CharsetRecog_IBM420_ar_ltr extends CharsetRecog_IBM420_ar - { - private static int[] ngrams = { - 0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF, - 0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD, - 0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156, - 0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156 - }; - - @Override - public String getName() - { - return "IBM420_ltr"; - } - @Override - public CharsetMatch match(CharsetDetector det) - { - int confidence = matchIBM420(det, ngrams, byteMap, (byte)0x40); - return confidence == 0 ? null : new CharsetMatch(det, this, confidence); - } - - } -} diff --git a/src/main/java/com/ibm/icu/text/CharsetRecognizer.java b/src/main/java/com/ibm/icu/text/CharsetRecognizer.java deleted file mode 100644 index d770403..0000000 --- a/src/main/java/com/ibm/icu/text/CharsetRecognizer.java +++ /dev/null @@ -1,52 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/** -******************************************************************************* -* Copyright (C) 2005-2012, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ -package com.ibm.icu.text; - -/** - * Abstract class for recognizing a single charset. - * Part of the implementation of ICU's CharsetDetector. - * - * Each specific charset that can be recognized will have an instance - * of some subclass of this class. All interaction between the overall - * CharsetDetector and the stuff specific to an individual charset happens - * via the interface provided here. - * - * Instances of CharsetDetector DO NOT have or maintain - * state pertaining to a specific match or detect operation. - * The WILL be shared by multiple instances of CharsetDetector. - * They encapsulate const charset-specific information. - */ -abstract class CharsetRecognizer { - /** - * Get the IANA name of this charset. - * @return the charset name. - */ - abstract String getName(); - - /** - * Get the ISO language code for this charset. - * @return the language code, or null if the language cannot be determined. - */ - public String getLanguage() - { - return null; - } - - /** - * Test the match of this charset with the input text data - * which is obtained via the CharsetDetector object. - * - * @param det The CharsetDetector, which contains the input text - * to be checked for being in this charset. - * @return A CharsetMatch object containing details of match - * with this charset, or null if there was no match. - */ - abstract CharsetMatch match(CharsetDetector det); - -} diff --git a/src/main/java/org/billthefarmer/editor/Editor.java b/src/main/java/org/billthefarmer/editor/Editor.java index 2984a28..727c042 100644 --- a/src/main/java/org/billthefarmer/editor/Editor.java +++ b/src/main/java/org/billthefarmer/editor/Editor.java @@ -75,10 +75,7 @@ import android.widget.TextView; import android.support.v4.content.FileProvider; -/* -import com.ibm.icu.text.CharsetDetector; -import com.ibm.icu.text.CharsetMatch; -*/ + import org.commonmark.node.*; import org.commonmark.parser.Parser; import org.commonmark.renderer.html.HtmlRenderer; @@ -3110,23 +3107,10 @@ private CharSequence readFile(File file) { StringBuilder text = new StringBuilder(); // Open file - try (BufferedInputStream in = new - BufferedInputStream(new FileInputStream(file))) + try (BufferedReader reader = new BufferedReader + (new InputStreamReader + (new BufferedInputStream(new FileInputStream(file))))) { - // Create reader - BufferedReader reader = new - BufferedReader(new InputStreamReader(in)); - - // Default UTF-8 charset - Charset charset = Charset.forName("UTF-8"); - if (match != null) - charset = Charset.forName(match); - - // Detect charset, using hint - Reader detect = CharsetDetector.createBufferedReader(in, charset); - if (detect != null) - reader = new BufferedReader(detect); - String line; while ((line = reader.readLine()) != null) { @@ -3293,20 +3277,9 @@ protected CharSequence doInBackground(Uri... uris) // Create reader BufferedReader reader = new BufferedReader(new InputStreamReader(in)); -/* - // Default UTF-8 charset - String charset = "UTF-8"; - if (editor.match != null) - charset = editor.match; - - // Detect charset, using hint - CharsetMatch match = new - CharsetDetector().setDeclaredEncoding(charset) - .setText(in).detect(); -*/ - int size = (int) FileUtils.getSize(editor, uris[0], null, null); - in.mark (size); - String match = CharsetDetector.detectCharset(in); + + String match = CharsetDetector.detectCharset + (editor.getContentResolver().openInputStream(uris[0])); if (match != null) { @@ -3315,7 +3288,6 @@ protected CharSequence doInBackground(Uri... uris) editor.getActionBar().setSubtitle(editor.match)); reader = new BufferedReader (new InputStreamReader(in, match)); - // match.getReader()); } if (BuildConfig.DEBUG && match != null) diff --git a/src/main/java/org/mozilla/universalchardet/CharsetDetector.java b/src/main/java/org/mozilla/universalchardet/CharsetDetector.java index d204970..80b9171 100644 --- a/src/main/java/org/mozilla/universalchardet/CharsetDetector.java +++ b/src/main/java/org/mozilla/universalchardet/CharsetDetector.java @@ -79,9 +79,7 @@ private CharsetDetector() {} public static String detectCharset(InputStream inputStream) throws IOException { - String encoding = UniversalDetector.detectCharset(inputStream); - inputStream.reset(); - return encoding; + return UniversalDetector.detectCharset(inputStream); } /** @@ -89,7 +87,7 @@ public static String detectCharset(InputStream inputStream) * @param inputStream The stream to read from * @param defaultCharset defaultCharset to use if can't be determined * @return BufferedReader for the file with the correct encoding - * @throws java.io.IOException if some I/O error ocurrs + * @throws java.io.IOException if some I/O error occurs */ public static BufferedReader createBufferedReader(InputStream inputStream, Charset defaultCharset)