diff --git a/build.gradle b/build.gradle index 5927c61..4881ca8 100644 --- a/build.gradle +++ b/build.gradle @@ -38,12 +38,13 @@ android { lintOptions { disable 'IconDensities', 'ContentDescription', 'OldTargetApi', - 'NonConstantResourceId', 'ExpiredTargetSdkVersion', 'MediaCapabilities', - 'FindViewByIdCast', 'WrongViewCast' + 'NonConstantResourceId', 'ExpiredTargetSdkVersion', + 'MediaCapabilities', 'FindViewByIdCast', 'WrongViewCast' // abortOnError false } } dependencies { implementation 'org.commonmark:commonmark:0.18.0' + implementation 'com.github.albfernandez:juniversalchardet:2.4.0' } diff --git a/src/main/java/org/billthefarmer/editor/Editor.java b/src/main/java/org/billthefarmer/editor/Editor.java index 1c3746f..2984a28 100644 --- a/src/main/java/org/billthefarmer/editor/Editor.java +++ b/src/main/java/org/billthefarmer/editor/Editor.java @@ -75,14 +75,16 @@ import android.widget.TextView; import android.support.v4.content.FileProvider; - +/* import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; - +*/ import org.commonmark.node.*; import org.commonmark.parser.Parser; import org.commonmark.renderer.html.HtmlRenderer; +import org.mozilla.universalchardet.CharsetDetector; + import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -931,7 +933,7 @@ public boolean onPrepareOptionsMenu(Menu menu) // Get the charsets Set keySet = Charset.availableCharsets().keySet(); - String charsets[] = CharsetDetector.getAllDetectableCharsets(); + String charsets[] = CharsetDetector.CHARSETS; // Get the submenu MenuItem item = menu.findItem(R.id.charset); item.setTitle(match); @@ -3116,12 +3118,12 @@ private CharSequence readFile(File file) BufferedReader(new InputStreamReader(in)); // Default UTF-8 charset - String charset = "UTF-8"; + Charset charset = Charset.forName("UTF-8"); if (match != null) - charset = match; + charset = Charset.forName(match); // Detect charset, using hint - Reader detect = new 
CharsetDetector().getReader(in, charset); + Reader detect = CharsetDetector.createBufferedReader(in, charset); if (detect != null) reader = new BufferedReader(detect); @@ -3291,7 +3293,7 @@ protected CharSequence doInBackground(Uri... uris) // Create reader BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - +/* // Default UTF-8 charset String charset = "UTF-8"; if (editor.match != null) @@ -3301,17 +3303,23 @@ protected CharSequence doInBackground(Uri... uris) CharsetMatch match = new CharsetDetector().setDeclaredEncoding(charset) .setText(in).detect(); +*/ + int size = (int) FileUtils.getSize(editor, uris[0], null, null); + in.mark (size); + String match = CharsetDetector.detectCharset(in); if (match != null) { - editor.match = match.getName(); + editor.match = match; editor.runOnUiThread(() -> editor.getActionBar().setSubtitle(editor.match)); - reader = new BufferedReader(match.getReader()); + reader = new BufferedReader + (new InputStreamReader(in, match)); + // match.getReader()); } if (BuildConfig.DEBUG && match != null) - Log.d(TAG, "Charset " + match.getName()); + Log.d(TAG, "Charset " + match); String line; while ((line = reader.readLine()) != null) diff --git a/src/main/java/org/mozilla/universalchardet/CharsetDetector.java b/src/main/java/org/mozilla/universalchardet/CharsetDetector.java new file mode 100644 index 0000000..d204970 --- /dev/null +++ b/src/main/java/org/mozilla/universalchardet/CharsetDetector.java @@ -0,0 +1,114 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// Editor - Text editor for Android +// +// Copyright © 2021 Bill Farmer +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+// Bill Farmer william j farmer [at] yahoo [dot] co [dot] uk.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+package org.mozilla.universalchardet;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import java.nio.charset.Charset;
+
+/**
+ * Static helpers around {@link UniversalDetector} for detecting the
+ * charset of a stream and opening a reader that uses it.
+ */
+public final class CharsetDetector
+{
+    // Static methods only, not instantiable
+    private CharsetDetector() {}
+
+    /**
+     * Array of supported charset names, taken from {@link Constants}
+     */
+    public static final String[] CHARSETS =
+    {
+        Constants.CHARSET_ISO_2022_JP,
+        Constants.CHARSET_ISO_2022_CN,
+        Constants.CHARSET_ISO_2022_KR,
+        Constants.CHARSET_ISO_8859_5,
+        Constants.CHARSET_ISO_8859_7,
+        Constants.CHARSET_ISO_8859_8,
+        Constants.CHARSET_BIG5,
+        Constants.CHARSET_GB18030,
+        Constants.CHARSET_EUC_JP,
+        Constants.CHARSET_EUC_KR,
+        Constants.CHARSET_EUC_TW,
+        Constants.CHARSET_SHIFT_JIS,
+        Constants.CHARSET_IBM855,
+        Constants.CHARSET_IBM866,
+        Constants.CHARSET_KOI8_R,
+        Constants.CHARSET_MACCYRILLIC,
+        Constants.CHARSET_WINDOWS_1251,
+        Constants.CHARSET_WINDOWS_1252,
+        Constants.CHARSET_WINDOWS_1253,
+        Constants.CHARSET_WINDOWS_1255,
+        Constants.CHARSET_UTF_8,
+        Constants.CHARSET_UTF_16BE,
+        Constants.CHARSET_UTF_16LE,
+        Constants.CHARSET_UTF_32BE,
+        Constants.CHARSET_UTF_32LE,
+        Constants.CHARSET_TIS620,
+        // NOTE(review): "ASCCI" is presumably the library's own
+        // spelling of this constant, confirm before renaming
+        Constants.CHARSET_US_ASCCI
+    };
+
+    /**
+     * Gets the charset of content from InputStream.
+     *
+     * The stream is handed to {@link UniversalDetector} and then
+     * rewound with {@code reset()}, so the caller must pass a stream
+     * that supports mark/reset and must already have called
+     * {@code mark()} on it. The read limit has to cover whatever the
+     * detector reads, presumably up to the whole content, so mark
+     * with the content size or larger.
+     *
+     * @param inputStream InputStream containing text file, already marked
+     * @return The charset of the file, null if cannot be determined
+     * @throws IOException if some IO error occurs, or the stream
+     * cannot be reset to its mark
+     */
+    public static String detectCharset(InputStream inputStream)
+        throws IOException
+    {
+        String encoding = UniversalDetector.detectCharset(inputStream);
+        inputStream.reset();
+        return encoding;
+    }
+
+    /**
+     * Create a reader from a stream with correct encoding.
+     *
+     * The stream is marked here before detection, so the caller does
+     * not need to mark it. A stream that does not support mark/reset
+     * is wrapped in a BufferedInputStream first. If the detected
+     * charset is not available on this runtime the default is used.
+     *
+     * @param inputStream The stream to read from
+     * @param defaultCharset defaultCharset to use if can't be
+     * determined, UTF-8 if null
+     * @return BufferedReader for the stream with the correct encoding
+     * @throws java.io.IOException if some I/O error occurs
+     */
+    public static BufferedReader createBufferedReader(InputStream inputStream,
+                                                      Charset defaultCharset)
+        throws IOException
+    {
+        Charset cs = (defaultCharset == null)?
+            Charset.forName("UTF-8"): defaultCharset;
+
+        // detectCharset() rewinds with reset(), which fails on a
+        // stream without mark support, so wrap it if necessary
+        InputStream stream = inputStream.markSupported()?
+            inputStream: new BufferedInputStream(inputStream);
+
+        // Remember the current position, the amount the detector
+        // reads is not known in advance, so use the largest limit
+        stream.mark(Integer.MAX_VALUE);
+
+        String detectedEncoding = detectCharset(stream);
+
+        if (detectedEncoding != null)
+        {
+            try
+            {
+                cs = Charset.forName(detectedEncoding);
+            }
+
+            // IllegalCharsetNameException or
+            // UnsupportedCharsetException, the detector named a
+            // charset this runtime doesn't have, keep the default
+            catch (IllegalArgumentException ignored) {}
+        }
+
+        if (!cs.name().contains("UTF"))
+            return new BufferedReader
+                (new InputStreamReader(stream, cs));
+
+        // Unicode content may start with a byte order mark, which
+        // UnicodeBOMInputStream deals with
+        return new BufferedReader
+            (new InputStreamReader
+             (new UnicodeBOMInputStream(stream), cs));
+    }
+}