Skip to content

Commit

Permalink
optimize vacuum operation (#258)
Browse files Browse the repository at this point in the history
* optimize vacuum operation

Avoid repeatedly creating strings and vectors at runtime.

Turn O(n) path lookup into O(1) with a HashSet.

Early exit in stale file check loop.

* rename table uri to table path everywhere for consistency

* change path in public methods to uri

* bump python version
  • Loading branch information
QP Hou authored Jun 2, 2021
1 parent e48336d commit d922b31
Show file tree
Hide file tree
Showing 17 changed files with 371 additions and 196 deletions.
4 changes: 2 additions & 2 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "deltalake-python"
version = "0.4.9"
version = "0.5.0"
authors = ["Qingping Hou <[email protected]>"]
homepage = "https://github.com/delta-io/delta-rs"
license = "Apache-2.0"
Expand Down Expand Up @@ -54,4 +54,4 @@ requires-dist = [
"sphinx-rtd-theme; extra == 'devel'",
"toml; extra == 'devel'",
]
provides-extra = ["pandas", "devel"]
provides-extra = ["pandas", "devel"]
4 changes: 2 additions & 2 deletions python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ format: ## Format the code
$(info --- Rust format ---)
cargo fmt
$(info --- Python format ---)
black .
black deltalake tests *.py
isort .

.PHONY: check-rust
Expand Down Expand Up @@ -66,4 +66,4 @@ clean: ## Run clean

.PHONY: help
help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
26 changes: 20 additions & 6 deletions python/deltalake/table.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import warnings
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple
from urllib.parse import urlparse
Expand Down Expand Up @@ -61,15 +62,15 @@ def __str__(self) -> str:
class DeltaTable:
"""Create a DeltaTable instance."""

def __init__(self, table_path: str, version: Optional[int] = None):
def __init__(self, table_uri: str, version: Optional[int] = None):
"""
Create the Delta Table from a URI (or local path) with an optional version.
Multiple StorageBackends are currently supported: AWS S3, Azure Data Lake Storage Gen2 and local URI.
:param table_path: the path of the DeltaTable
:param table_uri: the URI (or local path) of the DeltaTable
:param version: version of the DeltaTable
"""
self._table = RawDeltaTable(table_path, version=version)
self._table = RawDeltaTable(table_uri, version=version)
self._metadata = Metadata(self._table)

def version(self) -> int:
Expand Down Expand Up @@ -123,9 +124,22 @@ def file_paths(self) -> List[str]:
"""
Get the list of files with an absolute path.
:return: list of the .parquet files with an absolute path referenced for the current version of the DeltaTable
:return: list of the .parquet files with an absolute URI referenced for the current version of the DeltaTable
"""
return self._table.file_paths()
warnings.warn(
"Call to deprecated method file_paths. Please use file_uris instead.",
category=DeprecationWarning,
stacklevel=2,
)
return self.file_uris()

def file_uris(self) -> List[str]:
"""
Get the list of files as absolute URIs.
:return: list of the .parquet files with an absolute URI referenced for the current version of the DeltaTable
"""
return self._table.file_uris()

def load_version(self, version: int) -> None:
"""
Expand Down Expand Up @@ -182,7 +196,7 @@ def to_pyarrow_dataset(
:return: the PyArrow dataset in PyArrow
"""
if partitions is None:
file_paths = self._table.file_paths()
file_paths = self._table.file_uris()
else:
file_paths = self._table.files_by_partitions(partitions)
paths = [urlparse(curr_file) for curr_file in file_paths]
Expand Down
18 changes: 8 additions & 10 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,17 @@ struct RawDeltaTableMetaData {
#[pymethods]
impl RawDeltaTable {
#[new]
fn new(table_path: &str, version: Option<deltalake::DeltaDataTypeLong>) -> PyResult<Self> {
fn new(table_uri: &str, version: Option<deltalake::DeltaDataTypeLong>) -> PyResult<Self> {
let table = match version {
None => rt()?.block_on(deltalake::open_table(table_path)),
Some(version) => {
rt()?.block_on(deltalake::open_table_with_version(table_path, version))
}
None => rt()?.block_on(deltalake::open_table(table_uri)),
Some(version) => rt()?.block_on(deltalake::open_table_with_version(table_uri, version)),
}
.map_err(PyDeltaTableError::from_raw)?;
Ok(RawDeltaTable { _table: table })
}

pub fn table_path(&self) -> PyResult<&str> {
Ok(&self._table.table_path)
pub fn table_uri(&self) -> PyResult<&str> {
Ok(&self._table.table_uri)
}

pub fn version(&self) -> PyResult<i64> {
Expand Down Expand Up @@ -121,7 +119,7 @@ impl RawDeltaTable {
match partition_filters {
Ok(filters) => Ok(self
._table
.get_file_paths_by_partitions(&filters)
.get_file_uris_by_partitions(&filters)
.map_err(PyDeltaTableError::from_raw)?),
Err(err) => Err(PyDeltaTableError::from_raw(err)),
}
Expand All @@ -135,8 +133,8 @@ impl RawDeltaTable {
.collect())
}

pub fn file_paths(&self) -> PyResult<Vec<String>> {
Ok(self._table.get_file_paths())
pub fn file_uris(&self) -> PyResult<Vec<String>> {
Ok(self._table.get_file_uris())
}

pub fn schema_json(&self) -> PyResult<String> {
Expand Down
6 changes: 3 additions & 3 deletions ruby/spec/deltalake_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@

describe Deltalake do
describe '#open_table' do
let(:table_path) do
let(:table_uri) do
File.expand_path('../rust/tests/data/simple_table')
end

subject(:table) { Deltalake.open_table(table_path) }
subject(:table) { Deltalake.open_table(table_uri) }

its(:table_path) { should eq(table_path) }
its(:table_uri) { should eq(table_uri) }
its(:version) { should eq 4 }

describe '#files' do
Expand Down
26 changes: 13 additions & 13 deletions ruby/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,23 @@ use rutie::{AnyObject, Array, Class, Integer, Object, RString};
use std::sync::Arc;

pub struct TableData {
table_path: String,
table_uri: String,
actual: Arc<DeltaTable>,
}

impl TableData {
fn new(table_path: String) -> Self {
println!("initializing with {}", table_path);
fn new(table_uri: String) -> Self {
println!("initializing with {}", table_uri);

let rt = tokio::runtime::Runtime::new().unwrap();
let table = rt.block_on(deltalake::open_table(&table_path)).unwrap();
let table = rt.block_on(deltalake::open_table(&table_uri)).unwrap();
let actual = Arc::new(table);

Self { table_path, actual }
Self { table_uri, actual }
}

fn table_path(&self) -> &str {
&self.table_path
fn table_uri(&self) -> &str {
&self.table_uri
}

fn version(&self) -> i64 {
Expand All @@ -51,15 +51,15 @@ class!(Table);
methods!(
Table,
rtself,
fn ruby_table_new(table_path: RString) -> AnyObject {
let table_data = TableData::new(table_path.unwrap().to_string());
fn ruby_table_new(table_uri: RString) -> AnyObject {
let table_data = TableData::new(table_uri.unwrap().to_string());

Class::from_existing("Table").wrap_data(table_data, &*TABLE_DATA_WRAPPER)
},
fn ruby_table_path() -> RString {
let table_path = rtself.get_data(&*TABLE_DATA_WRAPPER).table_path();
fn ruby_table_uri() -> RString {
let table_uri = rtself.get_data(&*TABLE_DATA_WRAPPER).table_uri();

RString::new_utf8(table_path)
RString::new_utf8(table_uri)
},
fn ruby_version() -> Integer {
let version = rtself.get_data(&*TABLE_DATA_WRAPPER).version();
Expand Down Expand Up @@ -87,7 +87,7 @@ pub extern "C" fn Init_table() {
Class::new("Table", Some(&data_class)).define(|klass| {
klass.def_self("new", ruby_table_new);

klass.def("table_path", ruby_table_path);
klass.def("table_uri", ruby_table_uri);
klass.def("version", ruby_version);
klass.def("files", ruby_files);
});
Expand Down
8 changes: 8 additions & 0 deletions rust/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,11 @@ Optional cargo package features
- `s3` - enable the S3 storage backend to work with Delta Tables in AWS S3.
- `azure` - enable the Azure storage backend to work with Delta Tables in Azure Data Lake Storage Gen2 accounts.
- `datafusion-ext` - enable the `datafusion::datasource::TableProvider` trait implementation for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion).


Development
-----------

To run the S3 integration tests from a local machine, we use docker-compose to
stand up a local AWS stack. To spin up the test environment, run
`docker-compose up` in the root of the `delta-rs` repo.
29 changes: 13 additions & 16 deletions rust/src/bin/delta-inspect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ async fn main() -> anyhow::Result<()> {
App::new("info")
.about("dump table metadata info")
.setting(AppSettings::ArgRequiredElseHelp)
.args(&[Arg::new("path").about("Table path").required(true)]),
.args(&[Arg::new("uri").about("Table URI").required(true)]),
)
.subcommand(
App::new("files")
.setting(AppSettings::ArgRequiredElseHelp)
.about("output list of files for a given version, default to latest")
.args(&[
Arg::new("path").about("Table path").required(true),
Arg::new("full_path")
.about("Display files in full path")
Arg::new("uri").about("Table URI").required(true),
Arg::new("full_uri")
.about("Display files in full URI")
.takes_value(false)
.long("full-path")
.long("full-uri")
.short('f'),
Arg::new("version")
.takes_value(true)
Expand All @@ -40,29 +40,26 @@ async fn main() -> anyhow::Result<()> {

match matches.subcommand() {
Some(("files", files_matches)) => {
let table_path = files_matches.value_of("path").unwrap();
let table_uri = files_matches.value_of("uri").unwrap();

let table = match files_matches.value_of_t::<i64>("version") {
Ok(v) => deltalake::open_table_with_version(table_path, v).await?,
Ok(v) => deltalake::open_table_with_version(table_uri, v).await?,
Err(clap::Error {
kind: clap::ErrorKind::ArgumentNotFound,
..
}) => deltalake::open_table(table_path).await?,
}) => deltalake::open_table(table_uri).await?,
Err(e) => e.exit(),
};

if files_matches.is_present("full_path") {
table
.get_file_paths()
.iter()
.for_each(|f| println!("{}", f));
if files_matches.is_present("full_uri") {
table.get_file_uris().iter().for_each(|f| println!("{}", f));
} else {
table.get_files().iter().for_each(|f| println!("{}", f));
table.get_files_iter().for_each(|f| println!("{}", f));
};
}
Some(("info", info_matches)) => {
let table_path = info_matches.value_of("path").unwrap();
let table = deltalake::open_table(table_path).await?;
let table_uri = info_matches.value_of("uri").unwrap();
let table = deltalake::open_table(table_uri).await?;
println!("{}", table);
}
_ => unreachable!(),
Expand Down
Loading

0 comments on commit d922b31

Please sign in to comment.