Implement reading qvd files from Python IO #27

Open · wants to merge 1 commit into master
30 changes: 23 additions & 7 deletions qvd/qvd_reader.py
@@ -1,13 +1,29 @@
-from .qvd import read_qvd
+from .qvd import read_qvd, read_qvd_from_buffer
 import pandas as pd
+import io


-def read(file_name):
-    data = read_qvd(file_name)
-    df = pd.DataFrame.from_dict(data)
+def read(file):
+    data_dict = read_to_dict(file)
+    df = pd.DataFrame.from_dict(data_dict)
     return df


-def read_to_dict(file_name):
-    data = read_qvd(file_name)
-    return data
+def read_to_dict(file):
+    if (isinstance(file, io.TextIOBase)
+            or isinstance(file, io.BufferedIOBase)
+            or isinstance(file, io.RawIOBase)
+            or isinstance(file, io.IOBase)):
+        try:
+            unpacked_data = file.read()
+        except UnicodeDecodeError:
+            raise Exception("Supply raw file access: use mode \"rb\" instead of mode \"r\"")
+    elif isinstance(file, bytes):
+        unpacked_data = file
+    elif isinstance(file, str):
+        return read_qvd(file)
+    else:
+        raise Exception("Please supply a raw string or a file")
+    result_data = read_qvd_from_buffer(unpacked_data)
+    return result_data
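
With this change, qvd_reader.read() accepts an open binary file object or a bytes buffer as well as a path. A minimal usage sketch, not part of the diff, assuming the package is importable as qvd and the script is run from the repository root so that qvd/test_files/AAPL.qvd exists:

from qvd import qvd_reader

# Path input behaves as before and delegates to read_qvd().
df_from_path = qvd_reader.read('qvd/test_files/AAPL.qvd')

# File-object input must be opened in binary mode ("rb"); a text-mode handle
# hits the UnicodeDecodeError branch and its "use mode \"rb\"" message.
with open('qvd/test_files/AAPL.qvd', 'rb') as fin:
    df_from_file = qvd_reader.read(fin)

# Raw bytes are passed straight through to read_qvd_from_buffer().
with open('qvd/test_files/AAPL.qvd', 'rb') as fin:
    df_from_bytes = qvd_reader.read(fin.read())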

8 changes: 8 additions & 0 deletions qvd/test_qvd_reader.py
@@ -14,3 +14,11 @@ def test_read_size(self):
         qvd = qvd_reader.read(f'{os.path.dirname(__file__)}/test_files/AAPL.qvd')
         csv = pd.read_csv(f'{os.path.dirname(__file__)}/test_files/AAPL.csv', float_precision='round_trip')
         assert np.array_equal(np.sort(qvd.columns, axis=0), np.sort(csv.columns, axis=0))
+
+    def test_qvd_from_in_memory(self):
+        with open(f'{os.path.dirname(__file__)}/test_files/AAPL.qvd', 'rb') as fin:
+            qvd = qvd_reader.read(fin)
+        csv = pd.read_csv(f'{os.path.dirname(__file__)}/test_files/AAPL.csv', float_precision='round_trip')
+        assert qvd.shape == csv.shape
+        assert np.array_equal(np.sort(qvd.columns, axis=0), np.sort(csv.columns, axis=0))
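
The Rust changes below expose the read_qvd_from_buffer binding that read_to_dict() delegates to. It can also be called directly; a minimal sketch, assuming the compiled extension is importable as qvd.qvd, as the from .qvd import line above implies. It returns a plain dict mapping field names to lists of values rather than a DataFrame:

from qvd.qvd import read_qvd_from_buffer

# The binding takes the entire file contents as bytes (PyO3 converts them to Vec<u8>).
with open('qvd/test_files/AAPL.qvd', 'rb') as fin:
    columns = read_qvd_from_buffer(fin.read())

print(list(columns.keys()))  # field names from the QVD table header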

61 changes: 51 additions & 10 deletions src/lib.rs
@@ -4,7 +4,7 @@ use pyo3::{prelude::*, types::PyDict};
 use quick_xml::de::from_str;
 use qvd_structure::{QvdFieldHeader, QvdTableHeader};
 use std::io::SeekFrom;
-use std::io::{self, Read};
+use std::io::{self, Read, Cursor};
 use std::path::Path;
 use std::str;
 use std::{collections::HashMap, fs::File};
@@ -14,6 +14,7 @@ pub mod qvd_structure;
 #[pymodule]
 fn qvd(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(read_qvd, m)?)?;
+    m.add_function(wrap_pyfunction!(read_qvd_from_buffer, m)?)?;

     Ok(())
 }
@@ -49,6 +50,36 @@ fn read_qvd(py: Python, file_name: String) -> PyResult<Py<PyDict>> {
     Ok(dict.into())
 }

+#[pyfunction]
+fn read_qvd_from_buffer(py: Python, input_buffer: Vec<u8>) -> PyResult<Py<PyDict>> {
+    let xml: String = get_xml_data_from_raw_data(&input_buffer).expect("Error reading qvd data");
+    let dict = PyDict::new(py);
+    let binary_section_offset = xml.as_bytes().len();
+
+    let qvd_structure: QvdTableHeader = from_str(&xml).unwrap();
+    let mut symbol_map: HashMap<String, Vec<Option<String>>> = HashMap::new();
+
+    // Seek to the end of the XML section
+    let buf = &input_buffer[binary_section_offset..];
+    let rows_start = qvd_structure.offset;
+    let rows_end = buf.len();
+    let rows_section = &buf[rows_start..rows_end];
+    let record_byte_size = qvd_structure.record_byte_size;
+
+    for field in qvd_structure.fields.headers {
+        symbol_map.insert(
+            field.field_name.clone(),
+            get_symbols_as_strings(&buf, &field),
+        );
+        let symbol_indexes = get_row_indexes(&rows_section, &field, record_byte_size);
+        let column_values =
+            match_symbols_with_indexes(&symbol_map[&field.field_name], &symbol_indexes);
+        dict.set_item(field.field_name, column_values).unwrap();
+    }
+    Ok(dict.into())
+}
+
+
 fn read_qvd_to_buf(mut f: File, binary_section_offset: usize) -> Vec<u8> {
     f.seek(SeekFrom::Start(binary_section_offset as u64))
         .unwrap();
@@ -177,23 +208,33 @@ fn bitslice_to_vec(bitslice: &BitSlice<Msb0, u8>) -> Vec<u8> {
     v
 }

+fn extract_xml_data(reader: &mut dyn io::BufRead) -> Result<String, io::Error> {
+    let mut buffer = Vec::new();
+    // There is a line break, carriage return and a null terminator between the XML and data
+    // Find the null terminator
+    reader
+        .read_until(0, &mut buffer)
+        .expect("Failed to find null terminator in QVD");
+    let xml_string =
+        str::from_utf8(&buffer[..]).expect("xml section contains invalid UTF-8 chars");
+    Ok(xml_string.to_owned())
+}
+
 fn get_xml_data(file_name: &str) -> Result<String, io::Error> {
     match read_file(file_name) {
         Ok(mut reader) => {
-            let mut buffer = Vec::new();
-            // There is a line break, carriage return and a null terminator between the XMl and data
-            // Find the null terminator
-            reader
-                .read_until(0, &mut buffer)
-                .expect("Failed to read file");
-            let xml_string =
-                str::from_utf8(&buffer[..]).expect("xml section contains invalid UTF-8 chars");
-            Ok(xml_string.to_owned())
+            extract_xml_data(&mut reader)
         }
         Err(e) => Err(e),
     }
 }

+fn get_xml_data_from_raw_data(raw_data: &Vec<u8>) -> Result<String, io::Error> {
+    let mut cursor = Cursor::new(raw_data);
+    extract_xml_data(&mut cursor)
+}
+
 fn read_file<P>(filename: P) -> io::Result<io::BufReader<File>>
 where
     P: AsRef<Path>,
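
For reference, extract_xml_data and binary_section_offset rely on the QVD layout: an XML table header terminated by a null byte, followed by the binary symbol tables and row indexes. A rough Python illustration of that split, a sketch only and not part of the diff, assuming a local copy of qvd/test_files/AAPL.qvd:

with open('qvd/test_files/AAPL.qvd', 'rb') as fin:
    raw = fin.read()

null_pos = raw.index(b'\x00')            # the terminator that read_until(0, ...) scans for
xml_header = raw[:null_pos].decode('utf-8')
binary_section = raw[null_pos + 1:]      # symbol tables and row indexes start here

print(xml_header[:60])
print(len(binary_section), 'bytes of binary data')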