From c34a48b0a5009530092c614b04dde02f4b80e68b Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 5 Sep 2023 10:52:15 -0400 Subject: [PATCH] Lenient VCF 4.4 read switch. --- build.gradle | 11 ++++- src/main/java/htsjdk/samtools/Defaults.java | 8 +++- .../java/htsjdk/variant/vcf/VCFCodec.java | 12 ++++-- .../htsjdk/variant/vcf/VCFHeaderVersion.java | 3 +- .../htsjdk/variant/vcf/VCFFileReaderTest.java | 16 ++++++- .../htsjdk/variant/VCF4_4HeaderTest.vcf | 42 +++++++++++++++++++ 6 files changed, 84 insertions(+), 8 deletions(-) create mode 100644 src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf diff --git a/build.gradle b/build.gradle index 948e7e7b84..eef637763f 100644 --- a/build.gradle +++ b/build.gradle @@ -129,6 +129,15 @@ task testWithDefaultReference(type: Test) { } } +task testWithLenientVCF4_4(type: Test) { + description = "Run tests with lenient VCF 4.4 reading" + jvmArgs += '-Dsamjdk.lenient_vcf_4_4=true' + + useTestNG { + includeGroups "lenient_vcf_4_4" + } +} + test { description = "Runs the unit tests other than the SRA tests" @@ -139,7 +148,7 @@ test { excludeGroups "slow", "broken", "defaultReference", "ftp", "http", "sra", "ena", "unix" } } -} dependsOn testWithDefaultReference +} dependsOn testWithDefaultReference, testWithLenientVCF4_4 task testFTP(type: Test) { diff --git a/src/main/java/htsjdk/samtools/Defaults.java b/src/main/java/htsjdk/samtools/Defaults.java index 0cfff05d5d..1fae385218 100644 --- a/src/main/java/htsjdk/samtools/Defaults.java +++ b/src/main/java/htsjdk/samtools/Defaults.java @@ -59,6 +59,9 @@ public class Defaults { */ public static final String DEFAULT_VCF_EXTENSION; + // accept VCF 4.4 files for read, but treat them as VCF 4.3 + public static final boolean LENIENT_VCF_4_4; + /** * Even if BUFFER_SIZE is 0, this is guaranteed to be non-zero. If BUFFER_SIZE is non-zero, * this == BUFFER_SIZE (Default = 128k). @@ -105,12 +108,14 @@ public class Defaults { */ public static final String DISABLE_SNAPPY_PROPERTY_NAME = "snappy.disable"; + public static final String LENIENT_VCF_4_4_PROPERTY = "lenient_vcf_4_4"; + + /** * Disable use of the Snappy compressor. Default = false. */ public static final boolean DISABLE_SNAPPY_COMPRESSOR; - public static final String SAMJDK_PREFIX = "samjdk."; static { CREATE_INDEX = getBooleanProperty("create_index", false); @@ -134,6 +139,7 @@ public class Defaults { SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name())); SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false); DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false); + LENIENT_VCF_4_4 = getBooleanProperty(LENIENT_VCF_4_4_PROPERTY, false); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 42f07150d1..959b9b5f88 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -25,6 +25,8 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.LineIterator; @@ -72,6 +74,8 @@ * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { + private final static Log log = Log.getInstance(VCFCodec.class); + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; @@ -96,9 +100,11 @@ public Object readActualHeader(final LineIterator lineIterator) { throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); foundHeaderVersion = true; version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3) + if (Defaults.LENIENT_VCF_4_4 == true && version == VCFHeaderVersion.VCF4_4 ) { + // if lenient is enabled, accept VCFv4.4 as input, but treat it as VCFv4.3, and hope for the best + log.warn("********** WARNING: VCFv4.4 is not yet supported - processing as VCF4.3! **********"); + version = VCFHeaderVersion.VCF4_3; + } else if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3) throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); } headerStrings.add(lineIterator.next()); diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java index 43f43c65c3..45264938e5 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java @@ -37,7 +37,8 @@ public enum VCFHeaderVersion { VCF4_0("VCFv4.0", "fileformat"), VCF4_1("VCFv4.1", "fileformat"), VCF4_2("VCFv4.2", "fileformat"), - VCF4_3("VCFv4.3", "fileformat"); + VCF4_3("VCFv4.3", "fileformat"), + VCF4_4("VCFv4.4", "fileformat"); private final String versionString; private final String formatString; diff --git a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java index d848209d99..e9e91a147b 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java @@ -3,8 +3,6 @@ import com.google.common.jimfs.Configuration; import com.google.common.jimfs.Jimfs; import htsjdk.HtsjdkTest; -import htsjdk.samtools.seekablestream.SeekableStream; -import htsjdk.samtools.seekablestream.SeekableStreamFactory; import htsjdk.samtools.util.IOUtil; import htsjdk.tribble.TestUtils; import org.testng.Assert; @@ -50,6 +48,10 @@ Object[][] pathsData() { // various ways to refer to a local file {TEST_DATA_DIR + "VCF4HeaderTest.vcf", null, false, true}, + // this file is the same as VCF4HeaderTest.vcf, except the header is marked as VCF 4.4 + // this test is expected to fail here, since it requires the "lenient_vcf_4_4" property to be set + {TEST_DATA_DIR + "VCF4_4HeaderTest.vcf", null, false, false}, + // // this is almost a vcf, but not quite it's missing the #CHROM line and it has no content... {TEST_DATA_DIR + "Homo_sapiens_assembly38.tile_db_header.vcf", null, false, false}, @@ -102,6 +104,16 @@ public void testCanOpenVCFPathReader(final String file, final String index, fina Assert.assertTrue(shouldSucceed, "Test should have failed but succeeded"); } + @Test(groups = "lenient_vcf_4_4") + public void testAcceptLenientVCF4_4() { + // this file is the same as VCF4HeaderTest.vcf, except the header is marked as VCF 4.4 + // this will fail if the ", "lenient_vcf_4_4" property isn't set + try (final VCFFileReader reader = new VCFFileReader(Paths.get(TEST_DATA_DIR + "VCF4_4HeaderTest.vcf"), false)) { + final VCFHeader header = reader.getFileHeader(); + Assert.assertEquals(header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3); + } + } + @Test public void testTabixFileWithEmbeddedSpaces() throws IOException { final File testVCF = new File(TEST_DATA_DIR, "HiSeq.10000.vcf.bgz"); diff --git a/src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf b/src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf new file mode 100644 index 0000000000..9e79b73b7b --- /dev/null +++ b/src/test/resources/htsjdk/variant/VCF4_4HeaderTest.vcf @@ -0,0 +1,42 @@ +##fileformat=VCFv4.4 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= 0.06"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[/humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-23/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-24/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-5/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-9/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-6/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-19/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-25/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-4/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-14/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-22/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-2/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-3/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-7/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-16/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-1/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-17/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-8/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-10/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-18/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-20/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-11/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-15/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-21/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-12/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-13/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam] read_buffer_size=null read_filter=[] intervals=[chrX] excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod, interval,Intervals,chrX] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod hapmap=null hapmap_chip=null out=null err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=10000 num_threads=1 interval_merging=ALL read_group_black_list=null genotype_model=JOINT_ESTIMATE base_model=EMPIRICAL heterozygosity=7.8E-4 genotype=false output_all_callable_bases=false standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=10.0 trigger_min_confidence_threshold_for_calling=30.0 trigger_min_confidence_threshold_for_emitting=30.0 noSLOD=false assume_single_sample_reads=null platform=null min_base_quality_score=20 min_mapping_quality_score=20 max_mismatches_in_40bp_window=3 use_reads_with_bad_mates=false max_deletion_fraction=0.05 cap_base_quality_by_mapping_quality=false" +##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf, mask,Bed,wgs.v9/HiSeq.WGS.cleaned.indels.10.mask] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[] filterName=[] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskName=Indel NO_HEADER=false" +##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.vcf] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[QUAL < 50.0, MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1), AB > 0.75 && DP > 40, DP > 120 || SB > -0.10] filterName=[LowQual, HARD_TO_VALIDATE, ABFilter, DPFilter] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=10 maskName=Mask NO_HEADER=false" +##source=VariantOptimizer +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 109 . A T 0 FDRtranche2.00to10.00+ AC=1;AF=0.50;AN=2;DP=1019;Dels=0.00;HRun=0;HaplotypeScore=686.65;MQ=19.20;MQ0=288;OQ=2175.54;QD=2.13;SB=-1042.18 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99