From 6cdcbd479f7a6e56fc30e9664dd4885c2357862d Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Mon, 11 Dec 2023 10:47:57 -0500 Subject: [PATCH 1/5] Re-Ordering priority in SeekableStreamFactory * Now when constructing streams it will prefer NIO plugins. * If no http / ftp plugin exists it will fall back to the htsjdk bulit in * No longer special casing local files, these will now be handled through NIO as well. * Deprecating SeekableStreamFactory.isFilePath() since it is no longer used and interacts poorly with nio filesystem providers --- .../seekablestream/SeekableStreamFactory.java | 70 +++++++++++++------ .../java/htsjdk/tribble/FeatureCodec.java | 1 + .../tribble/TribbleIndexedFeatureReader.java | 13 ++-- .../SeekableStreamFactoryTest.java | 29 ++++++-- 4 files changed, 79 insertions(+), 34 deletions(-) diff --git a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java index 7148323be4..5ddd95acd1 100644 --- a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java +++ b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java @@ -23,12 +23,17 @@ */ package htsjdk.samtools.seekablestream; +import htsjdk.io.HtsPath; +import htsjdk.io.IOPath; import htsjdk.samtools.util.IOUtil; +import htsjdk.tribble.TribbleException; + import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URL; import java.nio.channels.SeekableByteChannel; +import java.nio.file.Path; import java.util.function.Function; /** @@ -61,9 +66,28 @@ public static ISeekableStreamFactory getInstance(){ * Does this path point to a regular file on disk and not something like a URL? * @param path the path to test * @return true if the path is to a file on disk + * @deprecated this method is simplistic and no longer particularly useful since IOPath allows similar access to + * various non-file data sources, internal use has been replaced with {@link #isSpecialCase(String)} */ + @Deprecated public static boolean isFilePath(final String path) { - return ! ( path.startsWith("http:") || path.startsWith("https:") || path.startsWith("ftp:") ); + return !isSpecialCaseUrlType(path); + } + + /** + * is this path being handled by one of the legacy SeekableStream types (http(s) / ftp) + * + * @param path a path to check + * @return if the path is not being handled by a FileSystemProvider and it can be read by legacy streams + */ + public static boolean isSpecialCase(final String path){ + return !new HtsPath(path).isPath() //if we have a provider for it that's what we'll use + && isSpecialCaseUrlType(path); // otherwise we fall back to the special handlers + } + + //is this onee of the url types that has legacy htsjdk support built in? + private static boolean isSpecialCaseUrlType(final String path) { + return path.startsWith("http:") || path.startsWith("https:") || path.startsWith("ftp:"); } private static class DefaultSeekableStreamFactory implements ISeekableStreamFactory { @@ -79,7 +103,7 @@ public SeekableStream getStreamFor(final String path) throws IOException { } /** - * The wrapper will only be applied to the stream if the stream is treated as a {@link java.nio.file.Path} + * The wrapper will only be applied to the stream if the stream is treated as a {@link Path} * * This currently means any uri with a scheme that is not http, https, ftp, or file will have the wrapper applied to it * @@ -89,26 +113,30 @@ public SeekableStream getStreamFor(final String path) throws IOException { @Override public SeekableStream getStreamFor(final String path, Function wrapper) throws IOException { - // todo -- add support for SeekableBlockInputStream - - if (path.startsWith("http:") || path.startsWith("https:")) { - final URL url = new URL(path); - return new SeekableHTTPStream(url); - } else if (path.startsWith("ftp:")) { - return new SeekableFTPStream(new URL(path)); - } else if (path.startsWith("file:")) { - try { - // convert to URI in order to obtain a decoded version of the path string suitable - // for use with the File constructor - final String decodedPath = new URI(path).getPath(); - return new SeekableFileStream(new File(decodedPath)); - } catch (java.net.URISyntaxException e) { - throw new IllegalArgumentException(String.format("The input string %s contains a URI scheme but is not a valid URI", path), e); - } - } else if (IOUtil.hasScheme(path)) { - return new SeekablePathStream(IOUtil.getPath(path), wrapper); + return getStreamFor(new HtsPath(path), wrapper); + } + + + /** + * The wrapper will only be applied to the stream if the stream is treated as a non file:// {@link Path} + * + * This has a fall back to htsjdk's built in http and ftp providers if no FileSystemProvder is available for them + * + * @param path an IOPath to be opened + * @param wrapper a wrapper to apply to the stream allowing direct transformations on the byte stream to be applied + * @throws IOException + */ + public static SeekableStream getStreamFor(final IOPath path, Function wrapper) throws IOException { + if(path.isPath()) { + return path.getScheme().equals("file") + ? new SeekablePathStream(path.toPath()) //don't apply the wrapper to local files + : new SeekablePathStream(path.toPath(), wrapper); } else { - return new SeekableFileStream(new File(path)); + return switch(path.getScheme()){ + case "http", "https" -> new SeekableHTTPStream(new URL(path.getURIString())); + case "ftp" -> new SeekableFTPStream((new URL(path.getURIString()))); + default -> throw new TribbleException("Unknown path type. No FileSystemProvider available for " + path.getRawInputString()); + }; } } diff --git a/src/main/java/htsjdk/tribble/FeatureCodec.java b/src/main/java/htsjdk/tribble/FeatureCodec.java index fc3e8f6181..68199b282f 100644 --- a/src/main/java/htsjdk/tribble/FeatureCodec.java +++ b/src/main/java/htsjdk/tribble/FeatureCodec.java @@ -18,6 +18,7 @@ package htsjdk.tribble; +import htsjdk.io.IOPath; import htsjdk.samtools.util.LocationAware; import htsjdk.tribble.index.tabix.TabixFormat; diff --git a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java index 768c797ac0..fac9c0f109 100644 --- a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java +++ b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java @@ -23,6 +23,7 @@ */ package htsjdk.tribble; +import htsjdk.io.HtsPath; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.seekablestream.SeekableStreamFactory; import htsjdk.samtools.util.IOUtil; @@ -40,6 +41,7 @@ import java.net.URI; import java.net.URLEncoder; import java.nio.channels.SeekableByteChannel; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -60,9 +62,9 @@ public class TribbleIndexedFeatureReader extends Abst private Index index; /** - * is the path pointing to our source data a regular file? + * is the path backed by old style built in http(s) / ftp support instead of a FileSystemProvider */ - private final boolean pathIsRegularFile; + private final boolean pathIsOldStyleHttpOrFtp; /** * a potentially reusable seekable stream for queries over regular files @@ -97,8 +99,7 @@ public TribbleIndexedFeatureReader(final String featurePath, final FeatureCodec< } } - // does path point to a regular file? - this.pathIsRegularFile = SeekableStreamFactory.isFilePath(path); + this.pathIsOldStyleHttpOrFtp = SeekableStreamFactory.isSpecialCase(path); readHeader(); } @@ -203,7 +204,7 @@ private SeekableStream getSeekableStream() throws IOException { * @return true if */ private boolean reuseStreamInQuery() { - return pathIsRegularFile; + return !pathIsOldStyleHttpOrFtp; } @Override @@ -252,7 +253,7 @@ private void readHeader() throws IOException { PositionalBufferedStream pbs = null; try { is = ParsingUtils.openInputStream(path, wrapper); - if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8")))) { + if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, StandardCharsets.UTF_8)))) { // TODO: TEST/FIX THIS! https://github.com/samtools/htsjdk/issues/944 // TODO -- warning I don't think this can work, the buffered input stream screws up position is = new GZIPInputStream(new BufferedInputStream(is)); diff --git a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java index c5e6126ee9..6ba47b024d 100644 --- a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java +++ b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java @@ -17,13 +17,28 @@ public class SeekableStreamFactoryTest extends HtsjdkTest { private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools"); - @Test - public void testIsFilePath() { - Assert.assertEquals(SeekableStreamFactory.isFilePath("x"), true); - Assert.assertEquals(SeekableStreamFactory.isFilePath(""), true); - Assert.assertEquals(SeekableStreamFactory.isFilePath("http://broadinstitute.org"), false); - Assert.assertEquals(SeekableStreamFactory.isFilePath("https://broadinstitute.org"), false); - Assert.assertEquals(SeekableStreamFactory.isFilePath("ftp://broadinstitute.org"), false); + @DataProvider + public Object[][] getSpecialCasePaths(){ + return new Object[][]{ + {"x", true}, + {"", true}, + {"http://broadinstitute.org", false}, + {"https://broadinstitute.org", false}, + {"ftp://broadinstitute.org", false} + }; + } + + @Test(dataProvider = "getSpecialCasePaths") + public void testIsFilePath(String path, boolean expected) { + Assert.assertEquals(SeekableStreamFactory.isFilePath(path), expected); + } + + + // this test isn't particularly useful since we're not testing the meaninful case of having the http-nio provider + // installed + @Test(dataProvider = "getSpecialCasePaths") + public void testIsSpecialCase(String path, boolean expected) { + Assert.assertEquals(SeekableStreamFactory.isSpecialCase(path), ! expected); } @DataProvider(name="getStreamForData") From b94965918634980c295c22d50e18e91d903e9402 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Mon, 11 Dec 2023 16:29:58 -0500 Subject: [PATCH 2/5] Also handling ParsingUtils --- .../htsjdk/samtools/SAMRecordSetBuilder.java | 1 + .../seekablestream/SeekableStreamFactory.java | 36 +++++---- .../tribble/TribbleIndexedFeatureReader.java | 3 +- .../htsjdk/tribble/util/ParsingUtils.java | 43 ++++++----- .../SeekableStreamFactoryTest.java | 32 ++++++-- .../htsjdk/tribble/util/ParsingUtilsTest.java | 74 ++++++++++--------- 6 files changed, 113 insertions(+), 76 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java b/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java index d1a48fa997..111c67ad49 100644 --- a/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java +++ b/src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java @@ -54,6 +54,7 @@ public class SAMRecordSetBuilder implements Iterable { "chr21", "chr22", "chrX", "chrY", "chrM" }; + private static final String READ_GROUP_ID = "1"; private static final String SAMPLE = "FREE_SAMPLE"; private final Random random = new Random(TestUtil.RANDOM_SEED); diff --git a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java index 5ddd95acd1..35d97bfe57 100644 --- a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java +++ b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java @@ -25,15 +25,13 @@ import htsjdk.io.HtsPath; import htsjdk.io.IOPath; -import htsjdk.samtools.util.IOUtil; import htsjdk.tribble.TribbleException; -import java.io.File; import java.io.IOException; -import java.net.URI; import java.net.URL; import java.nio.channels.SeekableByteChannel; import java.nio.file.Path; +import java.util.Set; import java.util.function.Function; /** @@ -45,6 +43,14 @@ public class SeekableStreamFactory{ private static final ISeekableStreamFactory DEFAULT_FACTORY; + private static final String HTTP = "http"; + private static final String HTTPS = "https"; + private static final String FTP = "ftp"; + /** + * the set of url schemes that have special support in htsjdk that isn't through a FileSystemProvider + */ + private static final Set URL_SCHEMES_WITH_LEGACY_SUPPORT = Set.of(HTTP, FTP, HTTPS); + public static final String FILE_SCHEME = "file"; private static ISeekableStreamFactory currentFactory; static{ @@ -67,11 +73,11 @@ public static ISeekableStreamFactory getInstance(){ * @param path the path to test * @return true if the path is to a file on disk * @deprecated this method is simplistic and no longer particularly useful since IOPath allows similar access to - * various non-file data sources, internal use has been replaced with {@link #isSpecialCase(String)} + * various non-file data sources, internal use has been replaced with {@link #isBeingHandledByLegacyUrlSupport(String)} */ @Deprecated public static boolean isFilePath(final String path) { - return !isSpecialCaseUrlType(path); + return !canBeHandledByLegacyUrlSupport(path); } /** @@ -80,14 +86,14 @@ public static boolean isFilePath(final String path) { * @param path a path to check * @return if the path is not being handled by a FileSystemProvider and it can be read by legacy streams */ - public static boolean isSpecialCase(final String path){ - return !new HtsPath(path).isPath() //if we have a provider for it that's what we'll use - && isSpecialCaseUrlType(path); // otherwise we fall back to the special handlers + public static boolean isBeingHandledByLegacyUrlSupport(final String path){ + return !new HtsPath(path).hasFileSystemProvider() //if we have a provider for it that's what we'll use + && canBeHandledByLegacyUrlSupport(path); // otherwise we fall back to the special handlers } //is this onee of the url types that has legacy htsjdk support built in? - private static boolean isSpecialCaseUrlType(final String path) { - return path.startsWith("http:") || path.startsWith("https:") || path.startsWith("ftp:"); + public static boolean canBeHandledByLegacyUrlSupport(final String path) { + return URL_SCHEMES_WITH_LEGACY_SUPPORT.stream().anyMatch(path::startsWith); } private static class DefaultSeekableStreamFactory implements ISeekableStreamFactory { @@ -127,14 +133,14 @@ public SeekableStream getStreamFor(final String path, * @throws IOException */ public static SeekableStream getStreamFor(final IOPath path, Function wrapper) throws IOException { - if(path.isPath()) { - return path.getScheme().equals("file") - ? new SeekablePathStream(path.toPath()) //don't apply the wrapper to local files + if(path.hasFileSystemProvider()) { + return path.getScheme().equals(FILE_SCHEME) + ? new SeekableFileStream(path.toPath().toFile()) //don't apply the wrapper to local files : new SeekablePathStream(path.toPath(), wrapper); } else { return switch(path.getScheme()){ - case "http", "https" -> new SeekableHTTPStream(new URL(path.getURIString())); - case "ftp" -> new SeekableFTPStream((new URL(path.getURIString()))); + case HTTP, HTTPS -> new SeekableHTTPStream(new URL(path.getURIString())); + case FTP -> new SeekableFTPStream((new URL(path.getURIString()))); default -> throw new TribbleException("Unknown path type. No FileSystemProvider available for " + path.getRawInputString()); }; } diff --git a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java index fac9c0f109..9654f0cebe 100644 --- a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java +++ b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java @@ -23,7 +23,6 @@ */ package htsjdk.tribble; -import htsjdk.io.HtsPath; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.seekablestream.SeekableStreamFactory; import htsjdk.samtools.util.IOUtil; @@ -99,7 +98,7 @@ public TribbleIndexedFeatureReader(final String featurePath, final FeatureCodec< } } - this.pathIsOldStyleHttpOrFtp = SeekableStreamFactory.isSpecialCase(path); + this.pathIsOldStyleHttpOrFtp = SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(path); readHeader(); } diff --git a/src/main/java/htsjdk/tribble/util/ParsingUtils.java b/src/main/java/htsjdk/tribble/util/ParsingUtils.java index 6b4470a72a..6ef9a2b520 100644 --- a/src/main/java/htsjdk/tribble/util/ParsingUtils.java +++ b/src/main/java/htsjdk/tribble/util/ParsingUtils.java @@ -23,16 +23,17 @@ */ package htsjdk.tribble.util; +import htsjdk.io.HtsPath; +import htsjdk.io.IOPath; import htsjdk.samtools.seekablestream.SeekablePathStream; +import htsjdk.samtools.seekablestream.SeekableStreamFactory; import htsjdk.samtools.util.IOUtil; + import java.awt.Color; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.lang.reflect.Constructor; import java.net.MalformedURLException; -import java.net.URI; import java.net.URL; import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; @@ -49,7 +50,7 @@ public class ParsingUtils { private static URLHelperFactory urlHelperFactory = RemoteURLHelper::new; // HTML 4.1 color table, + orange and magenta - private static Map colorSymbols = new HashMap(); + private static final Map colorSymbols = new HashMap<>(); static { colorSymbols.put("white", "FFFFFF"); @@ -81,13 +82,13 @@ public static InputStream openInputStream(String path) return openInputStream(path, null); } - static private final Set URL_SCHEMES = new HashSet<>(Arrays.asList("http", "ftp", "https")); - /** * open an input stream from the given path and wrap the raw byte stream with a wrapper if given * - * the wrapper will only be applied to paths that are not http, https, ftp, or file, i.e. any {@link java.nio.file.Path} - * using a custom filesystem plugin + * the wrapper will only be applied to paths that are + * 1. not local files + * 2. not being handled by the legacy http(s)/ftp providers + * i.e. any {@link java.nio.file.Path} using a custom FileSystem plugin * @param uri a uri like string * @param wrapper to wrap the input stream in, may be used to implement caching or prefetching, etc * @return An inputStream appropriately created from uri and conditionally wrapped with wrapper (only in certain cases) @@ -95,18 +96,21 @@ public static InputStream openInputStream(String path) */ public static InputStream openInputStream(final String uri, final Function wrapper) throws IOException { - - final InputStream inputStream; - - if (URL_SCHEMES.stream().anyMatch(uri::startsWith)) { - inputStream = getURLHelper(new URL(uri)).openInputStream(); - } else if (!IOUtil.hasScheme(uri)) { - File file = new File(uri); - inputStream = Files.newInputStream(file.toPath()); + final IOPath path = new HtsPath(uri); + if(path.hasFileSystemProvider()){ + if(path.isPath()) { + return path.getScheme().equals("file") + ? Files.newInputStream(path.toPath()) + : new SeekablePathStream(path.toPath(), wrapper); + } else { + throw new IOException("FileSystemProvider for path " + path.getRawInputString() + " exits but failed to " + + " create path. \n" + path.getToPathFailureReason()); + } + } else if( SeekableStreamFactory.canBeHandledByLegacyUrlSupport(uri)){ + return getURLHelper(new URL(uri)).openInputStream(); } else { - inputStream = new SeekablePathStream(IOUtil.getPath(uri), wrapper); + throw new IOException("No FileSystemProvider available to handle path: " + path.getRawInputString()); } - return inputStream; } public static String join(String separator, Collection objects) { @@ -401,7 +405,8 @@ private static Color hexToColor(String string) { } - public static boolean resourceExists(String resource) throws IOException{ + public static boolean + resourceExists(String resource) throws IOException{ boolean remoteFile = resource.startsWith("http://") || resource.startsWith("https://") || resource.startsWith("ftp://"); if (remoteFile) { diff --git a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java index 6ba47b024d..c9005ce19e 100644 --- a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java +++ b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java @@ -1,5 +1,7 @@ package htsjdk.samtools.seekablestream; +import com.google.common.jimfs.Configuration; +import com.google.common.jimfs.Jimfs; import htsjdk.HtsjdkTest; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.TestUtil; @@ -10,8 +12,11 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.net.URISyntaxException; import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileSystem; +import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; public class SeekableStreamFactoryTest extends HtsjdkTest { @@ -29,6 +34,7 @@ public Object[][] getSpecialCasePaths(){ } @Test(dataProvider = "getSpecialCasePaths") + @Deprecated public void testIsFilePath(String path, boolean expected) { Assert.assertEquals(SeekableStreamFactory.isFilePath(path), expected); } @@ -37,8 +43,13 @@ public void testIsFilePath(String path, boolean expected) { // this test isn't particularly useful since we're not testing the meaninful case of having the http-nio provider // installed @Test(dataProvider = "getSpecialCasePaths") - public void testIsSpecialCase(String path, boolean expected) { - Assert.assertEquals(SeekableStreamFactory.isSpecialCase(path), ! expected); + public void testIsBeingHandledByLegacyUrlSupport(String path, boolean notExpected) { + Assert.assertEquals(SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(path), !notExpected); + } + + @Test(dataProvider = "getSpecialCasePaths") + public void testCanBeHandledByLegacyUrlSuppoort(String path, boolean notExpected){ + Assert.assertEquals(SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(path), !notExpected); } @DataProvider(name="getStreamForData") @@ -53,7 +64,7 @@ public Object[][] getStreamForData() throws MalformedURLException { { new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam").toExternalForm(), new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam").toExternalForm() }, { new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam.bai").toExternalForm(), - new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam.bai").toExternalForm() } + new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam.bai").toExternalForm() }, }; } @@ -62,6 +73,7 @@ public void testGetStreamFor(final String path, final String expectedPath) throw Assert.assertEquals(SeekableStreamFactory.getInstance().getStreamFor(path).getSource(), expectedPath); } + @Test public void testPathWithEmbeddedSpace() throws IOException { final File testBam = new File(TEST_DATA_DIR, "BAMFileIndexTest/index_test.bam"); @@ -84,7 +96,17 @@ public void testPathWithEmbeddedSpace() throws IOException { final int BYTES_TO_READ = 10; Assert.assertEquals(seekableStream.read(new byte[BYTES_TO_READ], 0,BYTES_TO_READ), BYTES_TO_READ); } - } + @Test + public void testGetSeekableStreamWorksOnJimfs() throws IOException { + try(final FileSystem fs = Jimfs.newFileSystem(Configuration.unix())) { + final Path file = fs.getPath("/file"); + Files.writeString(file,"hello"); + try(final SeekableStream stream = SeekableStreamFactory.getInstance().getStreamFor(file.toUri().toString())){ + final byte[] bytes = stream.readAllBytes(); + Assert.assertEquals(new String(bytes, StandardCharsets.UTF_8), "hello"); + } + } + } } diff --git a/src/test/java/htsjdk/tribble/util/ParsingUtilsTest.java b/src/test/java/htsjdk/tribble/util/ParsingUtilsTest.java index d6e5d62148..7585a0fa28 100644 --- a/src/test/java/htsjdk/tribble/util/ParsingUtilsTest.java +++ b/src/test/java/htsjdk/tribble/util/ParsingUtilsTest.java @@ -9,6 +9,7 @@ import org.testng.annotations.Test; import java.io.*; +import java.nio.charset.StandardCharsets; import java.nio.file.FileSystem; import java.nio.file.Files; import java.nio.file.Path; @@ -126,93 +127,96 @@ public void testSplitJoinEmptyFirst() { public void testFileDoesExist() throws IOException{ File tempFile = File.createTempFile(getClass().getSimpleName(), ".tmp"); tempFile.deleteOnExit(); - tstExists(tempFile.getAbsolutePath(), true); - tstExists(tempFile.toURI().toString(), true); + testExists(tempFile.getAbsolutePath(), true); + testExists(tempFile.toURI().toString(), true);; } @Test public void testFileDoesNotExist() throws IOException{ File tempFile = File.createTempFile(getClass().getSimpleName(), ".tmp"); tempFile.delete(); - tstExists(tempFile.getAbsolutePath(), false); - tstExists(tempFile.toURI().toString(), false); + testExists(tempFile.getAbsolutePath(), false); + testExists(tempFile.toURI().toString(), false); } @Test public void testInMemoryNioFileDoesExist() throws IOException{ - FileSystem fs = Jimfs.newFileSystem(Configuration.unix()); - Path file = fs.getPath("/file"); - Files.createFile(file); - tstExists(file.toUri().toString(), true); + try(FileSystem fs = Jimfs.newFileSystem(Configuration.unix())) { + Path file = fs.getPath("/file"); + Files.createFile(file); + testExists(file.toUri().toString(), true); + } } @Test public void testInMemoryNioFileDoesNotExist() throws IOException{ - FileSystem fs = Jimfs.newFileSystem(Configuration.unix()); - Path file = fs.getPath("/file"); - tstExists(file.toUri().toString(), false); + try(FileSystem fs = Jimfs.newFileSystem(Configuration.unix())) { + Path file = fs.getPath("/file"); + testExists(file.toUri().toString(), false); + } } @Test(groups = "ftp") public void testFTPDoesExist() throws IOException{ - tstExists(AVAILABLE_FTP_URL, true); + testExists(AVAILABLE_FTP_URL, true); } @Test(groups = "ftp") public void testFTPNotExist() throws IOException{ - tstExists(UNAVAILABLE_FTP_URL, false); + testExists(UNAVAILABLE_FTP_URL, false); } @Test public void testHTTPDoesExist() throws IOException{ - tstExists(AVAILABLE_HTTP_URL, true); + testExists(AVAILABLE_HTTP_URL, true); } @Test public void testHTTPNotExist() throws IOException{ - tstExists(UNAVAILABLE_HTTP_URL, false); + testExists(UNAVAILABLE_HTTP_URL, false); } - private void tstExists(String path, boolean expectExists) throws IOException{ - boolean exists = ParsingUtils.resourceExists(path); - Assert.assertEquals(exists, expectExists); + + private static void testExists(String path, boolean expectExists) throws IOException{ + Assert.assertEquals(ParsingUtils.resourceExists(path), expectExists); } @Test public void testFileOpenInputStream() throws IOException{ File tempFile = File.createTempFile(getClass().getSimpleName(), ".tmp"); tempFile.deleteOnExit(); - OutputStream os = IOUtil.openFileForWriting(tempFile); - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(os)); - writer.write("hello"); - writer.close(); - tstStream(tempFile.getAbsolutePath()); - tstStream(tempFile.toURI().toString()); + try(Writer writer = new BufferedWriter(new OutputStreamWriter(IOUtil.openFileForWriting(tempFile)))) { + writer.write("hello"); + } + testStream(tempFile.getAbsolutePath()); + testStream(tempFile.toURI().toString()); } @Test public void testInMemoryNioFileOpenInputStream() throws IOException{ - FileSystem fs = Jimfs.newFileSystem(Configuration.unix()); - Path file = fs.getPath("/file"); - Files.write(file, "hello".getBytes("UTF-8")); - tstStream(file.toUri().toString()); + try(FileSystem fs = Jimfs.newFileSystem(Configuration.unix())) { + Path file = fs.getPath("/file"); + Files.write(file, "hello".getBytes(StandardCharsets.UTF_8)); + testStream(file.toUri().toString()); + } } @Test(groups = "ftp") public void testFTPOpenInputStream() throws IOException{ - tstStream(AVAILABLE_FTP_URL); + testStream(AVAILABLE_FTP_URL); } @Test public void testHTTPOpenInputStream() throws IOException{ - tstStream(AVAILABLE_HTTP_URL); + testStream(AVAILABLE_HTTP_URL); } - private void tstStream(String path) throws IOException{ - InputStream is = ParsingUtils.openInputStream(path); - Assert.assertNotNull(is, "InputStream is null for " + path); - int b = is.read(); - Assert.assertNotSame(b, -1); + private static void testStream(String path) throws IOException{ + try(InputStream is = ParsingUtils.openInputStream(path)) { + Assert.assertNotNull(is, "InputStream is null for " + path); + int b = is.read(); + Assert.assertNotSame(b, -1); + } } From 9eca91acc8f15ebd32a4701999bfcf85baaef4a8 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Tue, 12 Dec 2023 14:42:07 -0500 Subject: [PATCH 3/5] update HtsPath to throw on non file schemes with malformed URIs instead of trying to interpret them as file:// --- src/main/java/htsjdk/io/HtsPath.java | 24 +++++++++++++++++++ .../seekablestream/SeekableStreamFactory.java | 4 ++-- .../tribble/TribbleIndexedFeatureReader.java | 2 +- .../java/htsjdk/tribble/util/FTPHelper.java | 8 ++++++- .../htsjdk/tribble/util/ParsingUtils.java | 10 ++++---- src/test/java/htsjdk/io/HtsPathUnitTest.java | 3 +++ .../SeekableStreamFactoryTest.java | 3 ++- .../tribble/AbstractFeatureReaderTest.java | 2 +- 8 files changed, 45 insertions(+), 11 deletions(-) diff --git a/src/main/java/htsjdk/io/HtsPath.java b/src/main/java/htsjdk/io/HtsPath.java index 9e9d50a450..c0ddd4ea90 100644 --- a/src/main/java/htsjdk/io/HtsPath.java +++ b/src/main/java/htsjdk/io/HtsPath.java @@ -253,6 +253,10 @@ private URI getURIForString(final String pathString) { tempURI = getCachedPath().toUri(); } } catch (URISyntaxException uriException) { + //check that the uri wasn't a badly encoded absolute uri of some sort + //if you don't do this it will be treated as a badly formed file:/ url + assertNoNonFileScheme(pathString, uriException); + // the input string isn't a valid URI; assume its a local (non-URI) file reference, and // use the URI resulting from the corresponding Path try { @@ -276,5 +280,25 @@ private URI getURIForString(final String pathString) { return tempURI; } + + /** + * check that there isn't a non file scheme at the start of the path + * @param pathString + * @param cause + */ + private static void assertNoNonFileScheme(String pathString, URISyntaxException cause){ + final String[] split = pathString.split(":"); + if(split.length > 1){ + if(split[0] == null || split[0].isEmpty()){ + throw new IllegalArgumentException("Malformed url " + pathString + " includes an empty scheme." + + "\nCheck that it is fully encoded.", cause); + } + if(!split[0].equals("file")){ + throw new IllegalArgumentException("Malformed url " + pathString + " includes a scheme: " + split[0] + ":/ but was an invalid URI." + + "\nCheck that it is fully encoded.", cause); + } + } + + } } diff --git a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java index 35d97bfe57..b1371b1daf 100644 --- a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java +++ b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java @@ -139,8 +139,8 @@ public static SeekableStream getStreamFor(final IOPath path, Function new SeekableHTTPStream(new URL(path.getURIString())); - case FTP -> new SeekableFTPStream((new URL(path.getURIString()))); + case HTTP, HTTPS -> new SeekableHTTPStream(new URL(path.getRawInputString())); + case FTP -> new SeekableFTPStream((new URL(path.getRawInputString()))); default -> throw new TribbleException("Unknown path type. No FileSystemProvider available for " + path.getRawInputString()); }; } diff --git a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java index 9654f0cebe..4460adf107 100644 --- a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java +++ b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java @@ -252,7 +252,7 @@ private void readHeader() throws IOException { PositionalBufferedStream pbs = null; try { is = ParsingUtils.openInputStream(path, wrapper); - if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, StandardCharsets.UTF_8)))) { + if (IOUtil.hasBlockCompressedExtension(new URI(path))) { // TODO: TEST/FIX THIS! https://github.com/samtools/htsjdk/issues/944 // TODO -- warning I don't think this can work, the buffered input stream screws up position is = new GZIPInputStream(new BufferedInputStream(is)); diff --git a/src/main/java/htsjdk/tribble/util/FTPHelper.java b/src/main/java/htsjdk/tribble/util/FTPHelper.java index e207cd45e7..19b5f77bf1 100644 --- a/src/main/java/htsjdk/tribble/util/FTPHelper.java +++ b/src/main/java/htsjdk/tribble/util/FTPHelper.java @@ -6,6 +6,7 @@ import java.io.IOException; import java.io.InputStream; +import java.net.URISyntaxException; import java.net.URL; /** @@ -35,7 +36,12 @@ public long getContentLength() throws IOException { @Override public InputStream openInputStream() throws IOException { - String file = url.getPath(); + String file = null; + try { + file = url.toURI().getPath(); + } catch (URISyntaxException e) { + throw new IOException(e); + } FTPClient ftp = FTPUtils.connect(url.getHost(), url.getUserInfo(), null); ftp.pasv(); ftp.retr(file); diff --git a/src/main/java/htsjdk/tribble/util/ParsingUtils.java b/src/main/java/htsjdk/tribble/util/ParsingUtils.java index 6ef9a2b520..7eaefb730d 100644 --- a/src/main/java/htsjdk/tribble/util/ParsingUtils.java +++ b/src/main/java/htsjdk/tribble/util/ParsingUtils.java @@ -34,7 +34,9 @@ import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; +import java.net.URI; import java.net.URL; +import java.net.URLEncoder; import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; import java.util.*; @@ -405,12 +407,10 @@ private static Color hexToColor(String string) { } - public static boolean - resourceExists(String resource) throws IOException{ - - boolean remoteFile = resource.startsWith("http://") || resource.startsWith("https://") || resource.startsWith("ftp://"); + public static boolean resourceExists(String resource) throws IOException{ + boolean remoteFile = SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(resource); if (remoteFile) { - URL url = null; + URL url; try { url = new URL(resource); } catch (MalformedURLException e) { diff --git a/src/test/java/htsjdk/io/HtsPathUnitTest.java b/src/test/java/htsjdk/io/HtsPathUnitTest.java index 76135fce4b..c5ffe2c67d 100644 --- a/src/test/java/htsjdk/io/HtsPathUnitTest.java +++ b/src/test/java/htsjdk/io/HtsPathUnitTest.java @@ -92,6 +92,8 @@ public Object[][] validHtsPath() { {"gcs://abucket/bucket", "gcs://abucket/bucket", false, false}, {"gendb://somegdb", "gendb://somegdb", false, false}, {"chr1:1-100", "chr1:1-100", false, false}, + {"ftp://broad.org/file", "ftp://broad.org/file", false, false}, + {"ftp://broad.org/with%20space", "ftp://broad.org/with%20space", false, false}, //********************************************************************************************** // Valid URIs which ARE valid NIO URIs (there *IS* an installed file system provider), but are @@ -167,6 +169,7 @@ public Object[][] invalidHtsPath() { // the nul character is rejected on all of the supported platforms in both local // filenames and URIs, so use it to test HtsPath constructor failure on all platforms {"\0"}, + {"ftp://broad.org/file with space"} // this has a non-file scheme but isn't encoded properly }; } diff --git a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java index c9005ce19e..d4f0af5227 100644 --- a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java +++ b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java @@ -29,7 +29,8 @@ public Object[][] getSpecialCasePaths(){ {"", true}, {"http://broadinstitute.org", false}, {"https://broadinstitute.org", false}, - {"ftp://broadinstitute.org", false} + {"ftp://broadinstitute.org", false}, + {"ftp://broadinstitute.org/file%20with%20spaces", false} }; } diff --git a/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java b/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java index f212dbb3d5..1a6907de55 100644 --- a/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java +++ b/src/test/java/htsjdk/tribble/AbstractFeatureReaderTest.java @@ -72,7 +72,7 @@ public void testVcfOverHTTP() throws IOException { @Test(groups = "ftp") public void testLoadBEDFTP() throws Exception { - final String path = "ftp://ftp.broadinstitute.org/distribution/igv/TEST/cpgIslands with spaces.hg18.bed"; + final String path = "ftp://ftp.broadinstitute.org/distribution/igv/TEST/cpgIslands%20with%20spaces.hg18.bed"; final BEDCodec codec = new BEDCodec(); final AbstractFeatureReader bfs = AbstractFeatureReader.getFeatureReader(path, codec, false); for (final Feature feat : bfs.iterator()) { From 5376f559fa9604dbe9ca920fa122e1414ce9714e Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Tue, 12 Dec 2023 16:17:06 -0500 Subject: [PATCH 4/5] try this --- src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java index 4460adf107..f8fcee1619 100644 --- a/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java +++ b/src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java @@ -23,6 +23,7 @@ */ package htsjdk.tribble; +import htsjdk.io.HtsPath; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.seekablestream.SeekableStreamFactory; import htsjdk.samtools.util.IOUtil; @@ -252,7 +253,7 @@ private void readHeader() throws IOException { PositionalBufferedStream pbs = null; try { is = ParsingUtils.openInputStream(path, wrapper); - if (IOUtil.hasBlockCompressedExtension(new URI(path))) { + if (IOUtil.hasBlockCompressedExtension(new HtsPath(path).getURI())) { // TODO: TEST/FIX THIS! https://github.com/samtools/htsjdk/issues/944 // TODO -- warning I don't think this can work, the buffered input stream screws up position is = new GZIPInputStream(new BufferedInputStream(is)); From 652eda26fdb5f101d085b9607ce80e77b65688ca Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Tue, 12 Dec 2023 17:08:05 -0500 Subject: [PATCH 5/5] respond to comments --- src/main/java/htsjdk/io/HtsPath.java | 4 ++-- .../htsjdk/samtools/seekablestream/SeekableStreamFactory.java | 4 ++-- .../samtools/seekablestream/SeekableStreamFactoryTest.java | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/htsjdk/io/HtsPath.java b/src/main/java/htsjdk/io/HtsPath.java index c0ddd4ea90..c16090b450 100644 --- a/src/main/java/htsjdk/io/HtsPath.java +++ b/src/main/java/htsjdk/io/HtsPath.java @@ -254,7 +254,7 @@ private URI getURIForString(final String pathString) { } } catch (URISyntaxException uriException) { //check that the uri wasn't a badly encoded absolute uri of some sort - //if you don't do this it will be treated as a badly formed file:/ url + //if you don't do this it will be treated as a badly formed file:// url assertNoNonFileScheme(pathString, uriException); // the input string isn't a valid URI; assume its a local (non-URI) file reference, and @@ -294,7 +294,7 @@ private static void assertNoNonFileScheme(String pathString, URISyntaxException "\nCheck that it is fully encoded.", cause); } if(!split[0].equals("file")){ - throw new IllegalArgumentException("Malformed url " + pathString + " includes a scheme: " + split[0] + ":/ but was an invalid URI." + + throw new IllegalArgumentException("Malformed url " + pathString + " includes a scheme: " + split[0] + ":// but was an invalid URI." + "\nCheck that it is fully encoded.", cause); } } diff --git a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java index b1371b1daf..e0716d97e4 100644 --- a/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java +++ b/src/main/java/htsjdk/samtools/seekablestream/SeekableStreamFactory.java @@ -91,9 +91,9 @@ public static boolean isBeingHandledByLegacyUrlSupport(final String path){ && canBeHandledByLegacyUrlSupport(path); // otherwise we fall back to the special handlers } - //is this onee of the url types that has legacy htsjdk support built in? + //is this one of the url types that has legacy htsjdk support built in? public static boolean canBeHandledByLegacyUrlSupport(final String path) { - return URL_SCHEMES_WITH_LEGACY_SUPPORT.stream().anyMatch(path::startsWith); + return URL_SCHEMES_WITH_LEGACY_SUPPORT.stream().anyMatch(scheme-> path.startsWith(scheme +"://")); } private static class DefaultSeekableStreamFactory implements ISeekableStreamFactory { diff --git a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java index d4f0af5227..0c890ac8ff 100644 --- a/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java +++ b/src/test/java/htsjdk/samtools/seekablestream/SeekableStreamFactoryTest.java @@ -49,8 +49,8 @@ public void testIsBeingHandledByLegacyUrlSupport(String path, boolean notExpecte } @Test(dataProvider = "getSpecialCasePaths") - public void testCanBeHandledByLegacyUrlSuppoort(String path, boolean notExpected){ - Assert.assertEquals(SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(path), !notExpected); + public void testCanBeHandledByLegacyUrlSupport(String path, boolean notExpected){ + Assert.assertEquals(SeekableStreamFactory.canBeHandledByLegacyUrlSupport(path), !notExpected); } @DataProvider(name="getStreamForData")