Skip to content

Commit

Permalink
Merge pull request #233 from endast/232-vcf-generation-slow-when-using-reference
Browse files Browse the repository at this point in the history

Speed up reference usage and bump version
  • Loading branch information
endast authored Apr 5, 2024
2 parents 5729252 + 28070ee commit a07f802
Show file tree
Hide file tree
Showing 9 changed files with 13 additions and 11 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ By default `fake-vcf` writes to stdout
```shell
poetry run fake-vcf generate -s 2 -r 2
##fileformat=VCFv4.2
##source=VCFake 0.2.0
##source=VCFake 0.2.1
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##contig=<ID=chr1>
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
project = "fake-vcf"
copyright = "2023, Magnus Wahlberg"
author = "Magnus Wahlberg"
version = "0.2.0"
version = "0.2.1"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
2 changes: 1 addition & 1 deletion docs/source/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ By default `fake-vcf` writes to stdout
poetry run fake-vcf generate -s 2 -r 2
##fileformat=VCFv4.2
##source=VCFake 0.2.0
##source=VCFake 0.2.1
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##contig=<ID=chr1>
Expand Down
4 changes: 3 additions & 1 deletion fake_vcf/vcf_faker.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,9 @@ def __init__(

self.reference_data = None
if self.reference_dir:
self.reference_data = vcf_reference.load_reference_data(self.reference_file)
self.reference_data = vcf_reference.load_reference_data(
self.reference_file, memory_map=False
)
if self.reference_data.shape[0] < max(self.positions):
raise ValueError(
f"""Max position size {max(self.positions)} is outside the reference which has a max of {len(self.reference_data)}"""
Expand Down
6 changes: 3 additions & 3 deletions fake_vcf/vcf_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
METADATA_FILE_NAME = "sequence_metadata.json"


def get_ref_at_pos(ref_data, position):
reference_value = ref_data.take([position])[0][0].as_py()
def get_ref_at_pos(ref_data: pa.array, position):
reference_value = ref_data.column(0)[position].as_py()
return reference_value


def load_reference_data(reference_file, memory_map=True):
def load_reference_data(reference_file, memory_map):
reference_data = pq.read_table(reference_file, memory_map=memory_map)
return reference_data

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "fake-vcf"
version = "0.2.0"
version = "0.2.1"
description = "A fake vcf file generator "
readme = "README.md"
authors = ["fake-vcf <[email protected]>"]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data/chr1_small.vcf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
##fileformat=VCFv4.2
##source=VCFake 0.2.0
##source=VCFake 0.1.0
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##contig=<ID=chr1>
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data/chr2_small.vcf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
##fileformat=VCFv4.2
##source=VCFake 0.2.0
##source=VCFake 0.2.1
##FILTER=<ID=PASS,Description="All filters passed">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##contig=<ID=chr2>
Expand Down
2 changes: 1 addition & 1 deletion tests/test_data/reference/parquet/sequence_metadata.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"source_reference_file": "test-reference.fa",
"fake-vcf-version": "0.2.0",
"fake-vcf-version": "0.2.1",
"reference_files": {
"chr1": "fasta_chr1.parquet",
"chr2": "fasta_chr2.parquet",
Expand Down

0 comments on commit a07f802

Please sign in to comment.