Skip to content

Commit

Permalink
Support charsets other than UTF-8
Browse files Browse the repository at this point in the history
  • Loading branch information
reline committed Jan 16, 2021
1 parent 2b0a191 commit 3b4e77c
Show file tree
Hide file tree
Showing 8 changed files with 222 additions and 68 deletions.
17 changes: 15 additions & 2 deletions core/src/main/java/com/tickaroo/tikxml/TikXml.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import com.tickaroo.tikxml.typeadapter.TypeAdapter;
import java.io.IOException;
import java.lang.reflect.Type;
import java.nio.charset.Charset;

import okio.BufferedSink;
import okio.BufferedSource;

Expand Down Expand Up @@ -64,6 +66,17 @@ public Builder writeDefaultXmlDeclaration(boolean writeDeclaration) {
return this;
}

/**
 * Specifies the charset that is handed to the xml reader and writer whenever a document is
 * read or written.
 *
 * @param charset character encoding set to use when reading and writing the xml document;
 *     must not be {@code null}
 * @return The Builder itself
 * @throws NullPointerException if {@code charset} is {@code null}
 */
public Builder charset(Charset charset) {
  // Fail fast at configuration time instead of when the first document is read or written.
  if (charset == null) {
    throw new NullPointerException("charset == null");
  }
  config.charset = charset;
  return this;
}

/**
* Adds a type converter for the given class
*
Expand Down Expand Up @@ -105,7 +118,7 @@ private TikXml(TikXmlConfig config) {

public <T> T read(BufferedSource source, Type clazz) throws IOException {

XmlReader reader = XmlReader.of(source);
XmlReader reader = XmlReader.of(source, config.charset);

reader.beginElement();
reader.nextElementName(); // We don't care about the name of the root tag
Expand All @@ -125,7 +138,7 @@ public <T> void write(BufferedSink sink, T valueToWrite) throws IOException {

public <T> void write(BufferedSink sink, T valueToWrite, Type typeOfValueToWrite) throws IOException {

XmlWriter writer = XmlWriter.of(sink);
XmlWriter writer = XmlWriter.of(sink, config.charset);

TypeAdapter<T> adapter = config.getTypeAdapter(typeOfValueToWrite);
if (config.writeDefaultXmlDeclaration()) {
Expand Down
12 changes: 12 additions & 0 deletions core/src/main/java/com/tickaroo/tikxml/TikXmlConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import com.tickaroo.tikxml.typeadapter.TypeAdapter;

import java.lang.reflect.Type;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
* Holds the config for parsing and writing xml via {@link TikXml}
Expand All @@ -34,6 +36,7 @@ public final class TikXmlConfig {
TypeConverters typeConverters = new TypeConverters();
TypeAdapters typeAdapters = new TypeAdapters();
boolean writeDefaultXmlDeclaration = true;
Charset charset = StandardCharsets.UTF_8;

TikXmlConfig() {
}
Expand All @@ -58,6 +61,15 @@ public boolean writeDefaultXmlDeclaration() {
return writeDefaultXmlDeclaration;
}

/**
 * Returns the charset held by this config.
 *
 * @return the character encoding set used when reading and writing the xml document
 */
public Charset charset() {
  return this.charset;
}

/**
* Query a {@link TypeConverter} for a given class
*
Expand Down
67 changes: 40 additions & 27 deletions core/src/main/java/com/tickaroo/tikxml/XmlReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
* A class to read and parse an xml stream.
Expand All @@ -37,14 +39,13 @@ public class XmlReader implements Closeable {

//private static final ByteString LINEFEED_OR_CARRIAGE_RETURN = ByteString.encodeUtf8("\n\r");

private static final ByteString UNQUOTED_STRING_TERMINALS
= ByteString.encodeUtf8(" >/=\n");
private final ByteString unquotedStringTerminals;

private static final ByteString CDATA_CLOSE = ByteString.encodeUtf8("]]>");
private static final ByteString CDATA_OPEN = ByteString.encodeUtf8("<![CDATA[");
private static final ByteString DOCTYPE_OPEN = ByteString.encodeUtf8("<!DOCTYPE");
private static final ByteString COMMENT_CLOSE = ByteString.encodeUtf8("-->");
private static final ByteString XML_DECLARATION_CLOSE = ByteString.encodeUtf8("?>");
private final ByteString cdataClose;
private final ByteString cdataOpen;
private final ByteString doctypeOpen;
private final ByteString commentClose;
private final ByteString xmlDeclarationClose;
private static final ByteString UTF8_BOM = ByteString.of((byte) 0xEF, (byte) 0xBB, (byte) 0xBF);

private static final byte DOUBLE_QUOTE = '"';
Expand Down Expand Up @@ -97,21 +98,33 @@ public class XmlReader implements Closeable {

private final BufferedSource source;
private final Buffer buffer;
private final Charset charset;
private String currentElementName;

private XmlReader(BufferedSource source) {
/**
 * Creates a reader on top of {@code source} and pre-encodes the markup delimiters with
 * {@code charset}, so that they can be matched against the raw bytes of the stream.
 *
 * @param source the stream to read from; must not be {@code null}
 * @param charset the character encoding used to encode the delimiters and to decode text
 * @throws NullPointerException if {@code source} is {@code null}
 */
private XmlReader(BufferedSource source, Charset charset) {
  if (source == null) {
    throw new NullPointerException("source == null");
  }

  this.source = source;
  this.buffer = source.buffer();
  this.charset = charset;

  // NOTE(review): the delimiters below are encoded per charset, but other parts of this
  // reader appear to compare single bytes - presumably only ASCII-compatible charsets are
  // handled correctly; confirm before relying on multi-byte encodings such as UTF-16.
  this.cdataOpen = ByteString.encodeString("<![CDATA[", charset);
  this.cdataClose = ByteString.encodeString("]]>", charset);
  this.doctypeOpen = ByteString.encodeString("<!DOCTYPE", charset);
  this.commentClose = ByteString.encodeString("-->", charset);
  this.xmlDeclarationClose = ByteString.encodeString("?>", charset);
  this.unquotedStringTerminals = ByteString.encodeString(" >/=\n", charset);
}

/**
* Returns a new instance that reads an XML-encoded stream from {@code source}.
*/
public static XmlReader of(BufferedSource source) {
return new XmlReader(source);
return new XmlReader(source, StandardCharsets.UTF_8);
}

/**
 * Returns a new instance that reads an XML-encoded stream from {@code source}, using
 * {@code charset} as its character encoding.
 *
 * @param source the stream to read the xml document from
 * @param charset the character encoding handed to the new reader
 * @return a new reader for {@code source}
 * @throws NullPointerException if {@code source} is {@code null}
 */
public static XmlReader of(BufferedSource source, Charset charset) {
  final XmlReader reader = new XmlReader(source, charset);
  return reader;
}

/**
Expand Down Expand Up @@ -313,7 +326,7 @@ private int doPeek() throws IOException {
* @throws IOException
*/
private boolean isCDATA() throws IOException {
return fillBuffer(CDATA_OPEN.size()) && buffer.rangeEquals(0, CDATA_OPEN);
return fillBuffer(cdataOpen.size()) && buffer.rangeEquals(0, cdataOpen);
}

/**
Expand All @@ -324,8 +337,8 @@ private boolean isCDATA() throws IOException {
* @throws IOException
*/
private boolean isDocTypeDefinition() throws IOException {
return buffer.size() >= DOCTYPE_OPEN.size() &&
buffer.snapshot(DOCTYPE_OPEN.size()).toAsciiUppercase().equals(DOCTYPE_OPEN);
return buffer.size() >= doctypeOpen.size() &&
buffer.snapshot(doctypeOpen.size()).toAsciiUppercase().equals(doctypeOpen);
}

/**
Expand Down Expand Up @@ -564,14 +577,14 @@ public String nextTextContent() throws IOException {
+ "> but haven't found");
}

return buffer.readUtf8(index);
return buffer.readString(index, charset);
} else if (p == PEEKED_CDATA) {
peeked = PEEKED_NONE;

// Search index of closing CDATA tag ]]>
long index = indexOfClosingCDATA();

String result = buffer.readUtf8(index);
String result = buffer.readString(index, charset);
buffer.skip(3); // consume ]]>
return result;
} else if (p == PEEKED_ELEMENT_END) {
Expand Down Expand Up @@ -673,7 +686,7 @@ public boolean nextTextContentAsBoolean() throws IOException {
* @throws IOException
*/
private long indexOfClosingCDATA() throws IOException {
long index = source.indexOf(CDATA_CLOSE);
long index = source.indexOf(cdataClose);
if (index == -1) {
throw new EOFException("<![CDATA[ at " + getPath() + " has never been closed with ]]>");
}
Expand Down Expand Up @@ -810,12 +823,12 @@ private int nextNonWhitespace(boolean throwOnEof, boolean isDocumentBeginning) t
int peekStack = stack[stackSize - 1];

if (peekStack == XmlScope.NONEMPTY_DOCUMENT && isDocTypeDefinition()) {
long index = source.indexOf(CLOSING_XML_ELEMENT, DOCTYPE_OPEN.size());
long index = source.indexOf(CLOSING_XML_ELEMENT, doctypeOpen.size());
if (index == -1) {
throw syntaxError("Unterminated <!DOCTYPE> . Inline DOCTYPE is not support at the moment.");
}
// check if doctype uses brackets
long bracketIndex = source.indexOf(OPENING_DOCTYPE_BRACKET, DOCTYPE_OPEN.size(), index);
long bracketIndex = source.indexOf(OPENING_DOCTYPE_BRACKET, doctypeOpen.size(), index);
if (bracketIndex != -1) {
index = source.indexOf(ByteString.of(CLOSING_DOCTYPE_BRACKET, CLOSING_XML_ELEMENT), index + bracketIndex);
if (index == -1) {
Expand All @@ -829,19 +842,19 @@ private int nextNonWhitespace(boolean throwOnEof, boolean isDocumentBeginning) t
p = 0;
continue;
} else if (peek == '!' && fillBuffer(4)) {
long index = source.indexOf(COMMENT_CLOSE, 4); // skip <!-- in comparison by offset 4
long index = source.indexOf(commentClose, 4); // skip <!-- in comparison by offset 4
if (index == -1) {
throw syntaxError("Unterminated comment");
}
source.skip(index + COMMENT_CLOSE.size()); // skip behind --!>
source.skip(index + commentClose.size()); // skip behind --!>
p = 0;
continue;
} else if (peek == '?') {
long index = source.indexOf(XML_DECLARATION_CLOSE, 2); // skip <? in comparison by offset 2
long index = source.indexOf(xmlDeclarationClose, 2); // skip <? in comparison by offset 2
if (index == -1) {
throw syntaxError("Unterminated xml declaration or processing instruction \"<?\"");
}
source.skip(index + XML_DECLARATION_CLOSE.size()); // skip behind ?>
source.skip(index + xmlDeclarationClose.size()); // skip behind ?>
p = 0;
continue;
}
Expand Down Expand Up @@ -896,8 +909,8 @@ public String getCurrentElementName() {

/** Returns an unquoted value as a string. */
private String nextUnquotedValue() throws IOException {
long i = source.indexOfElement(UNQUOTED_STRING_TERMINALS);
return i != -1 ? buffer.readUtf8(i) : buffer.readUtf8();
long i = source.indexOfElement(unquotedStringTerminals);
return i != -1 ? buffer.readString(i, charset) : buffer.readString(charset);
}

/**
Expand All @@ -920,19 +933,19 @@ private String nextQuotedValue(byte runTerminator) throws IOException {
// If we've got an escape character, we're going to need a string builder.
if (buffer.getByte(index) == '\\') {
if (builder == null) builder = new StringBuilder();
builder.append(buffer.readUtf8(index));
builder.append(buffer.readString(index, charset));
buffer.readByte(); // '\'
builder.append(readEscapeCharacter());
continue;
}

// If it isn't the escape character, it's the quote. Return the string.
if (builder == null) {
String result = buffer.readUtf8(index);
String result = buffer.readString(index, charset);
buffer.readByte(); // Consume the quote character.
return result;
} else {
builder.append(buffer.readUtf8(index));
builder.append(buffer.readString(index, charset));
buffer.readByte(); // Consume the quote character.
return builder.toString();
}
Expand Down Expand Up @@ -988,7 +1001,7 @@ private char readEscapeCharacter() throws IOException {
} else if (c >= 'A' && c <= 'F') {
result += (c - 'A' + 10);
} else {
throw syntaxError("\\u" + buffer.readUtf8(4));
throw syntaxError("\\u" + buffer.readString(4, charset));
}
}
buffer.skip(4);
Expand Down
Loading

0 comments on commit 3b4e77c

Please sign in to comment.