Skip to content

Commit

Permalink
cleaned up rawhash python wrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
maximilianmordig committed Jun 4, 2024
1 parent e97d2fd commit 4d6dbf5
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 81 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ build/
run_dir/
.venv/
example_out/
__pycache__/

test/data/d1_*
test/data/d2_*
Expand Down
101 changes: 24 additions & 77 deletions python_wrapper/example.py
Original file line number Diff line number Diff line change
@@ -1,97 +1,47 @@
#%%
from pathlib import Path
import cppyy
import os

# os.chdir("/home/mmordig/rawhash_project/rawhash2_new/build/extern")
# Make sure to compile rawhash properly, both the CLI and the shared lib!!! Otherwise, the kernel may crash

header_file = Path("/home/mmordig/rawhash_project/rawhash2_new/src/rawhash_wrapper.hpp")
library_file = Path("/home/mmordig/rawhash_project/rawhash2_new/build/src/librawhash2_wrapper.so")
cppyy.include(str(header_file))
# cppyy.add_library_path("/home/mmordig/rawhash_project/rawhash2_new/build/extern/hdf5/lib")
cppyy.add_library_path("/home/mmordig/rawhash_project/rawhash2_new/build/src") # for shared libs
cppyy.load_library("librawhash2_wrapper")
import sys
sys.path.extend(["/home/mmordig/rawhash_project/rawhash2_new/python_wrapper"])
from rawhash_wrapper import *

# cppyy.add_library_path(str(library_file.parent))
# cppyy.load_library(str(library_file.name))
# cppyy.load_library(str(library_file))

# cppyy.gbl.RawHashMapper(5, [b"hello"])
args = [
#%%
rawhash_args = [
"-x", "sensitive", "-t", "8", "-p",
"/home/mmordig/rawhash_project/rawhash2_new/extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model",
]
# args += [
# rawhash_args += [
# "/home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/ref.fa",
# "no-revcomp-query",
# ]
args += [
rawhash_args += [
"/home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.ind",
]
# forwrev_rawhash2_sensitive.paf
# 0006a106-77eb-4752-b405-38d1635718fa
# /home/mmordig/rawhash_project/rawhash2_new/example_out/forwonly_rawhash2_sensitive.ind
# /home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.ind
args = ["my_dummy_program"] + args
mapper = cppyy.gbl.RawHashMapper(len(args), args)
mapper.idx_info()


#%%

slow5_filename = "/home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/small_slow5_files/barcode02_r0barcode02b0_0.blow5"
# we know rawhash finds an alignment since from the file /home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.paf
# we know rawhash finds an alignment for this read from the file /home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.paf
read_id = "0006a106-77eb-4752-b405-38d1635718fa"
import contextlib
import pyslow5
s5 = pyslow5.Open(slow5_filename, "r")
raw_signal = s5.get_read(read_id, pA=True)["signal"]
raw_signal = raw_signal[(raw_signal > 30.) & (raw_signal < 200.)]
# with contextlib.autoclosing(pyslow5.Open(slow5_filename, "r")) as s5:
raw_signal.shape
#%%
import numpy as np

# Create a numpy array
# arr = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
# # Ensure the numpy array is contiguous
# arr = np.ascontiguousarray(arr, dtype=np.float32)

# arr = raw_signal.astype(np.float32)
arr = np.ascontiguousarray(raw_signal, dtype=np.float32)

import cppyy.ll
# cppyy.set_debug()
# Call the C++ function with the numpy array
alignments = mapper.map(cppyy.ll.cast["float*"](arr.ctypes.data), len(arr))

class Alignment:
def __init__(self, contig, ref_start, ref_end, is_pos_strand):
self.contig = contig
self.ref_start = ref_start
self.ref_end = ref_end
self.is_pos_strand = is_pos_strand
@staticmethod
def from_cppyy(alignment):
return Alignment(
contig=alignment.contig,
ref_start=alignment.ref_start,
ref_end=alignment.ref_end,
is_pos_strand=alignment.is_pos_strand,
)
def __repr__(self):
return f"Alignment(contig: {self.contig}, start: {self.ref_start}, end: {self.ref_end}, pos_strand: {self.is_pos_strand})"

def parse_paf_line(line):
"""returns read_id, alignment"""
fields = line.strip().split("\t")
return fields[0], Alignment(
contig=fields[5],
ref_start=int(fields[7]),
ref_end=int(fields[8]),
is_pos_strand=fields[4] == "+",
)
# def format_alignment(alignment):
# # return alignment
# return f"Alignment(contig: {alignment.contig}, start: {alignment.ref_start}, end: {alignment.ref_end}, pos_strand: {alignment.is_pos_strand})"
raw_signal = get_read_signal(slow5_filename, read_id)
print(f"Raw signal shape: {raw_signal.shape}")

rawhash_lib_dir = Path("/home/mmordig/rawhash_project/rawhash2_new")
load_rawhash_wrapper_lib(rawhash_lib_dir)
mapper = get_rawhash_mapper(rawhash_args)
mapper.print_idx_info()
raw_signal = prepare_signal_for_rawhash(raw_signal)
# raw_signal = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
print(f"Raw signal shape: {raw_signal.shape}")
alignments = get_rawhash_alignments(mapper, raw_signal)
#%%

for (i, alignment) in enumerate(alignments):
print(f"Alignment {i}: {Alignment.from_cppyy(alignment)}")
Expand All @@ -101,9 +51,6 @@ def parse_paf_line(line):
for line in f:
rid, gt_alignment = parse_paf_line(line)
if rid == read_id:
print(f"GT Alignment for {rid}: {gt_alignment}")
print(f"GT Alignment for {rid}:\n {gt_alignment}")
break
#%%
1

# --rev-query
72 changes: 72 additions & 0 deletions python_wrapper/rawhash_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Import this file if you want to use rawhash2 in Python.
"""

from pathlib import Path
from typing import List, TypeVar
import cppyy
import cppyy.ll
import contextlib
import pyslow5
import numpy as np

def get_read_signal(slow5_filename, read_id: str):
with contextlib.closing(pyslow5.Open(slow5_filename, "r")) as s5:
return s5.get_read(read_id, pA=True)["signal"]

def prepare_signal_for_rawhash(raw_signal: np.ndarray[float]):
"""
Remove outliers, convert to numpy
raw_signal should be floats (with offset, range, digitisation scaling)
"""
raw_signal = raw_signal[(raw_signal > 30.) & (raw_signal < 200.)]
return raw_signal

class Alignment:
def __init__(self, contig, ref_start, ref_end, is_pos_strand):
self.contig = contig
self.ref_start = ref_start
self.ref_end = ref_end
self.is_pos_strand = is_pos_strand
@staticmethod
def from_cppyy(alignment):
return Alignment(
contig=alignment.contig,
ref_start=alignment.ref_start,
ref_end=alignment.ref_end,
is_pos_strand=alignment.is_pos_strand,
)
def __repr__(self):
return f"Alignment(contig: {self.contig}, start: {self.ref_start}, end: {self.ref_end}, pos_strand: {self.is_pos_strand})"

def load_rawhash_wrapper_lib(rawhash_lib_dir: Path):
rawhash_lib_dir = Path(rawhash_lib_dir)
header_file = rawhash_lib_dir / "src/rawhash_wrapper.hpp"
# library_file = rawhash_lib_dir / "build/src/librawhash2_wrapper.so"
cppyy.include(str(header_file))
cppyy.add_library_path("/home/mmordig/rawhash_project/rawhash2_new/build/src") # for shared libs
cppyy.load_library("librawhash2_wrapper")

# define type RawHashMapper

RawHashMapper = TypeVar("RawHashMapper")
def get_rawhash_mapper(rawhash_args: List[str]) -> RawHashMapper:
args = ["my_dummy_program"] + rawhash_args
mapper = cppyy.gbl.RawHashMapper(len(args), args)
return mapper

def get_rawhash_alignments(mapper: RawHashMapper, raw_signal: np.ndarray[float]) -> List[Alignment]:
raw_signal = np.ascontiguousarray(raw_signal, dtype=np.float32)
alignments = mapper.map(cppyy.ll.cast["float*"](raw_signal.ctypes.data), len(raw_signal))
# return alignments
return [Alignment.from_cppyy(alignment) for alignment in alignments]

def parse_paf_line(line):
"""returns read_id, alignment"""
fields = line.strip().split("\t")
return fields[0], Alignment(
contig=fields[5],
ref_start=int(fields[7]),
ref_end=int(fields[8]),
is_pos_strand=fields[4] == "+",
)
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ set_alternate_linker(mold)
# todo: currently can only compile rawhash2 or the below, not both
# this happens because functions like setup_hdf5 define the hdf5 target and the relationship to the target name
# rather than doing it separately
set(build_cli OFF)
set(build_cli ON)

if (build_cli)
message(STATUS "Building CLI")
Expand Down
2 changes: 1 addition & 1 deletion src/rawhash_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ RawHashMapper::~RawHashMapper() {
delete _opt;
}

void RawHashMapper::idx_info() const {
void RawHashMapper::print_idx_info() const {
fprintf(stderr, "[M::%s] index info\n", __func__); // todo
ri_idx_stat((ri_idx_t*)_ri);
}
Expand Down
2 changes: 1 addition & 1 deletion src/rawhash_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class RawHashMapper {
/**
* Get info about the index
*/
void idx_info() const;
void print_idx_info() const;

private:
void *_ri; // ri_idx_t*
Expand Down
2 changes: 1 addition & 1 deletion src/wrapper_example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

int main(int argc, char *argv[]) {
RawHashMapper mapper(argc, argv);
mapper.idx_info();
mapper.print_idx_info();
return 0;
}

0 comments on commit 4d6dbf5

Please sign in to comment.