Skip to content

Commit

Permalink
ExternalUtf8ContentFilterFactory: Don't convert pointer offsets to char
Browse files Browse the repository at this point in the history
Instead, calculate difference while we read through the input during
parsing. This saves us a whole pass through the input files, which
should improve performance for non-filesystem based sources (where we
don't have the page cache to help us out).
  • Loading branch information
jbaiter committed Jul 8, 2024
1 parent d4c4b8b commit 7eb4e66
Show file tree
Hide file tree
Showing 7 changed files with 561 additions and 171 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package com.github.dbmdz.solrocr.lucene.filters;

import com.github.dbmdz.solrocr.reader.StreamDecoder;
import java.io.IOException;
import java.io.Reader;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;

/**
* A Reader implementation that reads from a SeekableByteChannel and allows repositioning the
* reader.
*/
public class ByteSeekableReader extends Reader {
private final SeekableByteChannel channel;
private StreamDecoder decoder;

public ByteSeekableReader(SeekableByteChannel channel) {
this.channel = channel;
this.decoder = StreamDecoder.forDecoder(channel, StandardCharsets.UTF_8.newDecoder(), -1);
}

@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return this.decoder.read(cbuf, off, len);
}

/** Return the current byte position in the underlying channel. */
public int position() throws IOException {
return (int) this.channel.position();
}

/**
* Reposition the reader to the given byte position.
*
* <p>This will also reset the decoder.
*/
public void position(int newPosition) throws IOException {
this.channel.position(newPosition);
this.decoder = StreamDecoder.forDecoder(channel, StandardCharsets.UTF_8.newDecoder(), -1);
}

@Override
public void close() throws IOException {
this.channel.close();
}
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package com.github.dbmdz.solrocr.lucene.filters;

import com.github.dbmdz.solrocr.model.SourcePointer;
import com.github.dbmdz.solrocr.model.SourcePointer.Region;
import com.github.dbmdz.solrocr.util.SourceAwareReader;
import com.github.dbmdz.solrocr.util.Utf8;
import com.google.common.collect.ImmutableList;
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.channels.SeekableByteChannel;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
Expand All @@ -17,98 +20,132 @@ public class ExternalUtf8ContentFilter extends BaseCharFilter implements SourceA
* The cumulative offset difference between the input (bytes) and the output (chars) at the
* current position.
*
* <p>Used to calculate the <strong>byte</strong> offset in the input, given a
* <strong>char</strong> offset from the output of this filter.
*
* <pre>
* current actual byte offset in input = currentOutOffset + cumulative
* currentInputByteOffset = currentOutCharOffset + cumulativeOffsetDifference
* </pre>
*/
private int cumulative;
private int cumulativeOffsetDifference;

/** The current <strong>char</strong> offset in the full file; */
private int currentInOffset;
/**
* The current <strong>byte</strong> offset in the <strong>full</strong> input, i.e. the
* concatenated content of all files in the source pointer.
*/
private int currentInByteOffset;

/** The current <strong>char</strong> offset in the output. */
private int currentOutOffset;
/**
* The current <strong>char</strong> offset in the output, i.e. the concatenated and decoded
* content of all regions in the source pointer.
*/
private int currentOutCharOffset;

/** Source pointer of this reader, used for debugging and error reporting. */
private final String pointer;

private boolean nextIsOffset = false;
/** Whether the last seen character had more than 1 byte for a char */
private boolean lastCharHadMultipleBytes = false;

private final Queue<SourcePointer.Region> remainingRegions;
private SourcePointer.Region currentRegion;

public ExternalUtf8ContentFilter(Reader input, List<SourcePointer.Region> regions, String pointer)
public ExternalUtf8ContentFilter(
SeekableByteChannel channel, List<SourcePointer.Region> regions, String pointer)
throws IOException {
super(input);
// We need to be able to reposition the underlying reader, so we use our own implementation
// based on a SeekableByteChannel.
super(new ByteSeekableReader(channel));
if (regions == null || regions.isEmpty()) {
regions = ImmutableList.of(new Region(0, (int) channel.size()));
}
this.pointer = pointer;
this.currentOutOffset = 0;
this.currentInOffset = 0;
this.cumulative = 0;
this.currentOutCharOffset = 0;
this.currentInByteOffset = 0;
this.cumulativeOffsetDifference = 0;
this.remainingRegions = new LinkedList<>(regions);
currentRegion = remainingRegions.remove();
if (currentRegion.start > 0) {
this.addOffCorrectMap(currentOutOffset, currentRegion.startOffset);
this.cumulative += currentRegion.startOffset;
this.currentInOffset = (int) this.input.skip(currentRegion.start);
this.addOffCorrectMap(currentOutCharOffset, currentRegion.start);
this.cumulativeOffsetDifference += currentRegion.start;
this.currentInByteOffset = currentRegion.start;
((ByteSeekableReader) this.input).position(currentInByteOffset);
}
}

/**
* Read <tt>len</tt> <tt>char</tt>s into <tt>cbuf</tt>, starting from character index <tt>off</tt>
* relative to the beginning of <tt>cbuf</tt> and return the number of <tt>char</tt>s read.
* Read <tt>requestedCharLen</tt> <tt>char</tt>s into <tt>outputBuffer</tt>, starting from
* character index <tt>outputCharOffset</tt> relative to the beginning of <tt>outputBuffer</tt>
* and return the number of <tt>char</tt>s read.
*
* <p>Keeps track of the current byte offset in the input and the current char offset in the
* output.
*/
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
if (currentInOffset == currentRegion.end) {
public int read(char[] outputBuffer, int outputCharOffset, int requestedCharLen)
throws IOException {
if (currentInByteOffset == currentRegion.end) {
return -1;
}

int numCharsRead = 0;
while (len - numCharsRead > 0) {
int charsRemainingInRegion = currentRegion.end - currentInOffset;
int charsToRead = len - numCharsRead;
if (charsToRead > charsRemainingInRegion) {
charsToRead = charsRemainingInRegion;
while (requestedCharLen - numCharsRead > 0) {
int bytesRemainingInRegion = currentRegion.end - currentInByteOffset;
int charsToRead = requestedCharLen - numCharsRead;
if (charsToRead > bytesRemainingInRegion) {
charsToRead = bytesRemainingInRegion;
}

int read = this.input.read(cbuf, off, charsToRead);
if (read < 0) {
int charsRead = this.input.read(outputBuffer, outputCharOffset, charsToRead);
if (charsRead < 0) {
break;
}
correctOffsets(cbuf, off, read);
numCharsRead += read;
off += read;
while (Utf8.encodedLength(CharBuffer.wrap(outputBuffer, outputCharOffset, charsRead))
> bytesRemainingInRegion) {
charsRead--;
}
correctOffsets(outputBuffer, outputCharOffset, charsRead);
numCharsRead += charsRead;
outputCharOffset += charsRead;

if (currentInOffset == currentRegion.end) {
if (currentInByteOffset == currentRegion.end) {
if (remainingRegions.isEmpty()) {
break;
}
currentRegion = remainingRegions.remove();

cumulative = currentRegion.startOffset - currentOutOffset;
this.addOffCorrectMap(currentOutOffset, cumulative);
int toSkip = this.currentRegion.start - this.currentInOffset;
if (toSkip > 0) {
this.input.skip(this.currentRegion.start - this.currentInOffset);
cumulativeOffsetDifference = currentRegion.start - currentOutCharOffset;
this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
if (this.currentRegion.start > this.currentInByteOffset) {
this.currentInByteOffset = currentRegion.start;
}
this.currentInOffset = currentRegion.start;
((ByteSeekableReader) this.input).position(this.currentInByteOffset);
}
}
return numCharsRead > 0 ? numCharsRead : -1;
}

private void correctOffsets(char[] cbuf, int off, int len) {
for (int i = off; i < off + len; i++) {
if (nextIsOffset) {
this.addOffCorrectMap(currentOutOffset, cumulative);
nextIsOffset = false;
/**
* Updates the current input and output offsets based on the characters read from the input.
*
* @param decodedChars Buffer of characters that were read from the input
* @param bufOffset Offset in decodedChars, where the stored characters start
* @param numChars Number of characters stored in decodedChars
*/
private void correctOffsets(char[] decodedChars, int bufOffset, int numChars) {
for (int i = bufOffset; i < bufOffset + numChars; ) {
if (lastCharHadMultipleBytes) {
this.addOffCorrectMap(currentOutCharOffset, cumulativeOffsetDifference);
lastCharHadMultipleBytes = false;
}
currentInOffset += 1;
currentOutOffset += 1;
int cp = Character.codePointAt(cbuf, i);
int increment = Utf8.encodedLength(cp) - 1;
if (increment > 0) {
cumulative += increment;
nextIsOffset = true;
currentOutCharOffset += 1;
int cp = Character.codePointAt(decodedChars, i);
i += Character.charCount(cp);
int encodedLen = Utf8.encodedLength(cp);
currentInByteOffset += encodedLen;
if (encodedLen > 1) {
cumulativeOffsetDifference += (encodedLen - 1);
lastCharHadMultipleBytes = true;
}
}
}
Expand Down
10 changes: 2 additions & 8 deletions src/main/java/com/github/dbmdz/solrocr/model/SourcePointer.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
import com.github.dbmdz.solrocr.reader.FileSourceReader;
import com.github.dbmdz.solrocr.reader.MultiFileSourceReader;
import com.github.dbmdz.solrocr.reader.SourceReader;
import com.google.common.collect.ImmutableList;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
Expand Down Expand Up @@ -75,7 +75,7 @@ static Source parse(String pointer) {
throw new RuntimeException("Could not parse source pointer from '" + pointer + ".");
}
String target = m.group("target");
List<Region> regions = ImmutableList.of();
List<Region> regions = new ArrayList<>();
if (m.group("regions") != null) {
regions =
Arrays.stream(m.group("regions").split(","))
Expand Down Expand Up @@ -124,7 +124,6 @@ public static class Region {

public int start;
public int end;
public int startOffset = 0;

public static Region parse(String r) {
if (r.startsWith(":")) {
Expand All @@ -142,11 +141,6 @@ public Region(int start, int end) {
this.end = end;
}

public Region(int start, int end, int startOffset) {
this(start, end);
this.startOffset = startOffset;
}

@Override
public String toString() {
return start + ":" + end;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
import java.nio.channels.SeekableByteChannel;

/** API for reading data from a source. */
public interface SourceReader {
public interface SourceReader extends AutoCloseable {
/** Close the resources associated with this reader. */
@Override
void close() throws IOException;

/** Get the pointer this reader is reading from. */
Expand Down
Loading

0 comments on commit 7eb4e66

Please sign in to comment.