Skip to content

Commit

Permalink
Add mozilla character detector
Browse files Browse the repository at this point in the history
  • Loading branch information
billthefarmer committed Dec 31, 2021
1 parent 869acd9 commit 4898bdc
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 12 deletions.
5 changes: 3 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@ android {

lintOptions {
disable 'IconDensities', 'ContentDescription', 'OldTargetApi',
'NonConstantResourceId', 'ExpiredTargetSdkVersion', 'MediaCapabilities',
'FindViewByIdCast', 'WrongViewCast'
'NonConstantResourceId', 'ExpiredTargetSdkVersion',
'MediaCapabilities', 'FindViewByIdCast', 'WrongViewCast'
// abortOnError false
}
}

dependencies {
implementation 'org.commonmark:commonmark:0.18.0'
implementation 'com.github.albfernandez:juniversalchardet:2.4.0'
}
28 changes: 18 additions & 10 deletions src/main/java/org/billthefarmer/editor/Editor.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,16 @@
import android.widget.TextView;

import android.support.v4.content.FileProvider;

/*
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

*/
import org.commonmark.node.*;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.html.HtmlRenderer;

import org.mozilla.universalchardet.CharsetDetector;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
Expand Down Expand Up @@ -931,7 +933,7 @@ public boolean onPrepareOptionsMenu(Menu menu)

// Get the charsets
Set<String> keySet = Charset.availableCharsets().keySet();
String charsets[] = CharsetDetector.getAllDetectableCharsets();
String charsets[] = CharsetDetector.CHARSETS;
// Get the submenu
MenuItem item = menu.findItem(R.id.charset);
item.setTitle(match);
Expand Down Expand Up @@ -3116,12 +3118,12 @@ private CharSequence readFile(File file)
BufferedReader(new InputStreamReader(in));

// Default UTF-8 charset
String charset = "UTF-8";
Charset charset = Charset.forName("UTF-8");
if (match != null)
charset = match;
charset = Charset.forName(match);

// Detect charset, using hint
Reader detect = new CharsetDetector().getReader(in, charset);
Reader detect = CharsetDetector.createBufferedReader(in, charset);
if (detect != null)
reader = new BufferedReader(detect);

Expand Down Expand Up @@ -3291,7 +3293,7 @@ protected CharSequence doInBackground(Uri... uris)
// Create reader
BufferedReader reader = new
BufferedReader(new InputStreamReader(in));

/*
// Default UTF-8 charset
String charset = "UTF-8";
if (editor.match != null)
Expand All @@ -3301,17 +3303,23 @@ protected CharSequence doInBackground(Uri... uris)
CharsetMatch match = new
CharsetDetector().setDeclaredEncoding(charset)
.setText(in).detect();
*/
int size = (int) FileUtils.getSize(editor, uris[0], null, null);
in.mark (size);
String match = CharsetDetector.detectCharset(in);

if (match != null)
{
editor.match = match.getName();
editor.match = match;
editor.runOnUiThread(() ->
editor.getActionBar().setSubtitle(editor.match));
reader = new BufferedReader(match.getReader());
reader = new BufferedReader
(new InputStreamReader(in, match));
// match.getReader());
}

if (BuildConfig.DEBUG && match != null)
Log.d(TAG, "Charset " + match.getName());
Log.d(TAG, "Charset " + match);

String line;
while ((line = reader.readLine()) != null)
Expand Down
114 changes: 114 additions & 0 deletions src/main/java/org/mozilla/universalchardet/CharsetDetector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
////////////////////////////////////////////////////////////////////////////////
//
// Editor - Text editor for Android
//
// Copyright © 2021 Bill Farmer
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
// Bill Farmer william j farmer [at] yahoo [dot] co [dot] uk.
//
////////////////////////////////////////////////////////////////////////////////

package org.mozilla.universalchardet;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.nio.charset.Charset;

/**
 * Static helpers around {@link UniversalDetector} for detecting the
 * charset of a stream and for opening a reader that uses the result.
 * Not instantiable.
 */
public class CharsetDetector
{
    private CharsetDetector() {}

    /**
     * Charset names, taken from {@link Constants}, that are offered
     * as the supported charsets.
     *
     * NOTE(review): the "ASCCI" spelling of the last entry presumably
     * matches the constant as declared in Constants - confirm before
     * renaming it.
     */
    public static final String[] CHARSETS =
    {
        Constants.CHARSET_ISO_2022_JP,
        Constants.CHARSET_ISO_2022_CN,
        Constants.CHARSET_ISO_2022_KR,
        Constants.CHARSET_ISO_8859_5,
        Constants.CHARSET_ISO_8859_7,
        Constants.CHARSET_ISO_8859_8,
        Constants.CHARSET_BIG5,
        Constants.CHARSET_GB18030,
        Constants.CHARSET_EUC_JP,
        Constants.CHARSET_EUC_KR,
        Constants.CHARSET_EUC_TW,
        Constants.CHARSET_SHIFT_JIS,
        Constants.CHARSET_IBM855,
        Constants.CHARSET_IBM866,
        Constants.CHARSET_KOI8_R,
        Constants.CHARSET_MACCYRILLIC,
        Constants.CHARSET_WINDOWS_1251,
        Constants.CHARSET_WINDOWS_1252,
        Constants.CHARSET_WINDOWS_1253,
        Constants.CHARSET_WINDOWS_1255,
        Constants.CHARSET_UTF_8,
        Constants.CHARSET_UTF_16BE,
        Constants.CHARSET_UTF_16LE,
        Constants.CHARSET_UTF_32BE,
        Constants.CHARSET_UTF_32LE,
        Constants.CHARSET_TIS620,
        Constants.CHARSET_US_ASCCI
    };

    /**
     * Gets the charset of content from InputStream, then rewinds the
     * stream with {@code reset()}.
     *
     * The stream is passed to {@link UniversalDetector} and then
     * reset, so the caller must have called {@code mark()} on it
     * beforehand, with a read limit that covers whatever the detector
     * reads; otherwise {@code reset()} fails.
     *
     * @param inputStream InputStream containing text file, already
     *                    marked by the caller
     * @return The charset of the file, null if cannot be determined
     * @throws IOException if some IO error occurs, including a failed
     *                     {@code reset()}
     */
    public static String detectCharset(InputStream inputStream)
        throws IOException
    {
        String encoding = UniversalDetector.detectCharset(inputStream);
        inputStream.reset();
        return encoding;
    }

    /**
     * Create a reader from a stream with the detected encoding.
     *
     * The stream is marked at its current position before detection,
     * so the returned reader starts from where the stream was when it
     * was passed in. A stream that does not support mark/reset is
     * wrapped in a BufferedInputStream first.
     *
     * @param inputStream The stream to read from
     * @param defaultCharset defaultCharset to use if the charset
     *                       can't be determined, or if the detected
     *                       charset is not available on this
     *                       runtime; UTF-8 is used if this is null
     * @return BufferedReader for the stream with the chosen encoding
     * @throws java.io.IOException if some I/O error occurs
     */
    public static BufferedReader createBufferedReader(InputStream inputStream,
                                                      Charset defaultCharset)
        throws IOException
    {
        Charset cs = (defaultCharset == null)?
            Charset.forName("UTF-8"): defaultCharset;

        // detectCharset() rewinds with reset(), which only works on a
        // stream that supports mark and has been marked
        InputStream in = inputStream.markSupported()?
            inputStream: new BufferedInputStream(inputStream);

        // No bound on the read limit: how much the detector reads is
        // not known here
        in.mark(Integer.MAX_VALUE);

        String detectedEncoding = detectCharset(in);

        if (detectedEncoding != null)
        {
            try
            {
                cs = Charset.forName(detectedEncoding);
            }

            catch (IllegalArgumentException ignored)
            {
                // Detected name is illegal or unsupported on this
                // runtime: keep the default rather than crash
            }
        }

        if (!cs.name().contains("UTF"))
            return new BufferedReader
                (new InputStreamReader(in, cs));

        // For UTF charsets read through UnicodeBOMInputStream
        return new BufferedReader
            (new InputStreamReader
             (new UnicodeBOMInputStream(in), cs));
    }
}

0 comments on commit 4898bdc

Please sign in to comment.