diff --git a/sgkit/io/vcf/vcf_reader.py b/sgkit/io/vcf/vcf_reader.py index f1da036d3..c66a00db2 100644 --- a/sgkit/io/vcf/vcf_reader.py +++ b/sgkit/io/vcf/vcf_reader.py @@ -525,6 +525,10 @@ def vcf_to_zarr_sequential( ds.attrs["filters"] = filters ds.attrs["vcf_zarr_version"] = "0.1" ds.attrs["vcf_header"] = vcf.raw_header + try: + ds.attrs["contig_lengths"] = vcf.seqlens + except AttributeError: + pass for field_handler in field_handlers: field_handler.update_dataset(ds) diff --git a/sgkit/tests/io/vcf/test_vcf_reader.py b/sgkit/tests/io/vcf/test_vcf_reader.py index 661b5f2fe..eb5483fed 100644 --- a/sgkit/tests/io/vcf/test_vcf_reader.py +++ b/sgkit/tests/io/vcf/test_vcf_reader.py @@ -32,6 +32,7 @@ def test_vcf_to_zarr__small_vcf(shared_datadir, is_path, tmp_path): ds = xr.open_zarr(output) assert ds.attrs["contigs"] == ["19", "20", "X"] + assert "contig_lengths" not in ds.attrs assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) assert_array_equal( ds["variant_position"], @@ -155,6 +156,8 @@ def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path): vcf_to_zarr(path, output, chunk_length=5_000) ds = xr.open_zarr(output) + assert ds.attrs["contigs"] == ["20", "21"] + assert ds.attrs["contig_lengths"] == [63025520, 48129895] assert ds["sample_id"].shape == (1,) assert ds["call_genotype"].shape == (19910, 1, 2) assert ds["call_genotype_mask"].shape == (19910, 1, 2) diff --git a/sgkit/tests/io/vcf/test_vcf_roundtrip.py b/sgkit/tests/io/vcf/test_vcf_roundtrip.py index 7f591857c..e0c628688 100644 --- a/sgkit/tests/io/vcf/test_vcf_roundtrip.py +++ b/sgkit/tests/io/vcf/test_vcf_roundtrip.py @@ -199,6 +199,9 @@ def test_all_fields( assert allel_ds_contigs <= sg_ds_contigs del allel_ds.attrs["contigs"] del sg_ds.attrs["contigs"] + # scikit-allel doesn't store contig lengths + if "contig_lengths" in sg_ds.attrs: + del sg_ds.attrs["contig_lengths"] if allel_ds_contigs < sg_ds_contigs: # variant_contig variables are not comparable, so remove them before comparison