From 40d9bef87342732b67cd4f0db42ae67e7114a14f Mon Sep 17 00:00:00 2001 From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com> Date: Thu, 21 Dec 2023 22:23:39 +0100 Subject: [PATCH 1/2] more arr dist --- polars_distance/polars_distance/Cargo.toml | 2 +- .../polars_distance/__init__.py | 46 +++++++ polars_distance/polars_distance/src/array.rs | 119 ++++++++++++++++-- .../polars_distance/src/expressions.rs | 93 +++++++++++++- polars_distance/tests/test_distance_arr.py | 31 +++++ 5 files changed, 275 insertions(+), 16 deletions(-) diff --git a/polars_distance/polars_distance/Cargo.toml b/polars_distance/polars_distance/Cargo.toml index 87e0fce..e96e1cd 100644 --- a/polars_distance/polars_distance/Cargo.toml +++ b/polars_distance/polars_distance/Cargo.toml @@ -8,7 +8,7 @@ name = "polars_distance" crate-type = ["cdylib"] [dependencies] -polars = { version = "*" , features = ["dtype-array", 'dtype-categorical']} +polars = { version = "*" , features = ["dtype-array", 'dtype-categorical', 'dtype-u16', 'dtype-u8', 'dtype-i8','dtype-i16']} polars-core = {version = "*"} polars-arrow = {version = "*"} pyo3 = { version = "0.20", features = ["extension-module"] } diff --git a/polars_distance/polars_distance/polars_distance/__init__.py b/polars_distance/polars_distance/polars_distance/__init__.py index 4f3a60a..6fabb3b 100644 --- a/polars_distance/polars_distance/polars_distance/__init__.py +++ b/polars_distance/polars_distance/polars_distance/__init__.py @@ -49,6 +49,52 @@ def canberra(self, other: IntoExpr) -> pl.Expr: is_elementwise=True, ) + def bray_curtis(self, other: IntoExpr) -> pl.Expr: + """Returns Bray-Curtis distance between two vectors""" + return self._expr.register_plugin( + lib=lib, + args=[other], + symbol="bray_curtis_arr", + is_elementwise=True, + ) + + def manhatten(self, other: IntoExpr) -> pl.Expr: + """Returns Manhattan distance between two vectors""" + return self._expr.register_plugin( + lib=lib, + args=[other], + symbol="manhatten_arr", + 
is_elementwise=True, + ) + + def minkowski(self, other: IntoExpr, p: int) -> pl.Expr: + """Returns minkowski distance between two vectors""" + return self._expr.register_plugin( + lib=lib, + args=[other], + kwargs={"p": p}, + symbol="minkowski_arr", + is_elementwise=True, + ) + + def l3_norm(self, other: IntoExpr) -> pl.Expr: + """Returns l3_norm distance between two vectors""" + return self._expr.register_plugin( + lib=lib, + args=[other], + symbol="l3_norm_arr", + is_elementwise=True, + ) + + def l4_norm(self, other: IntoExpr) -> pl.Expr: + """Returns l4_norm distance between two vectors""" + return self._expr.register_plugin( + lib=lib, + args=[other], + symbol="l4_norm_arr", + is_elementwise=True, + ) + @pl.api.register_expr_namespace("dist_str") class DistancePairWiseString: diff --git a/polars_distance/polars_distance/src/array.rs b/polars_distance/polars_distance/src/array.rs index c9294e9..1ab3bc2 100644 --- a/polars_distance/polars_distance/src/array.rs +++ b/polars_distance/polars_distance/src/array.rs @@ -1,3 +1,4 @@ +use distances::vectors::minkowski; use polars::prelude::arity::try_binary_elementwise; use polars::prelude::*; use polars_arrow::array::{Array, PrimitiveArray}; @@ -7,11 +8,20 @@ fn collect_into_vecf64(arr: Box) -> Vec { .downcast_ref::>() .unwrap() .values_iter() - .map(|v| *v) + .copied() .collect::>() } -pub fn distance_calc_float_inp( +fn collect_into_uint64(arr: Box) -> Vec { + arr.as_any() + .downcast_ref::>() + .unwrap() + .values_iter() + .copied() + .collect::>() +} + +pub fn distance_calc_numeric_inp( a: &ChunkedArray, b: &ChunkedArray, f: fn(&[f64], &[f64]) -> f64, @@ -21,11 +31,17 @@ pub fn distance_calc_float_inp( ComputeError: "inner data types don't match" ); polars_ensure!( - a.inner_dtype().is_float(), - ComputeError: "inner data types must be float" + a.inner_dtype().is_numeric(), + ComputeError: "inner data types must be numeric" ); - try_binary_elementwise(a, b, |a: Option>, b| match (a, b) { + let s1 = 
a.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + let s2 = b.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + + let a: &ArrayChunked = s1.array()?; + let b: &ArrayChunked = s2.array()?; + + try_binary_elementwise(a, b, |a, b| match (a, b) { (Some(a), Some(b)) => { if a.null_count() > 0 || b.null_count() > 0 { polars_bail!(ComputeError: "array cannot contain nulls") @@ -39,6 +55,40 @@ pub fn distance_calc_float_inp( }) } +pub fn distance_calc_uint_inp( + a: &ChunkedArray, + b: &ChunkedArray, + f: fn(&[u64], &[u64]) -> f64, +) -> PolarsResult { + polars_ensure!( + a.inner_dtype() == b.inner_dtype(), + ComputeError: "inner data types don't match" + ); + polars_ensure!( + a.inner_dtype().is_unsigned_integer(), + ComputeError: "inner data types must be unsigned integer" + ); + + let s1 = a.cast(&DataType::Array(Box::new(DataType::UInt64), a.width()))?; + let s2 = b.cast(&DataType::Array(Box::new(DataType::UInt64), a.width()))?; + + let a: &ArrayChunked = s1.array()?; + let b: &ArrayChunked = s2.array()?; + + try_binary_elementwise(a, b, |a, b| match (a, b) { + (Some(a), Some(b)) => { + if a.null_count() > 0 || b.null_count() > 0 { + polars_bail!(ComputeError: "array cannot contain nulls") + } else { + let a = &collect_into_uint64(a); + let b = &collect_into_uint64(b); + Ok(Some(f(a, b))) + } + } + _ => Ok(None), + }) +} + pub fn euclidean_dist( a: &ChunkedArray, b: &ChunkedArray, @@ -48,11 +98,17 @@ pub fn euclidean_dist( ComputeError: "inner data types don't match" ); polars_ensure!( - a.inner_dtype().is_float(), - ComputeError: "inner data types must be float" + a.inner_dtype().is_numeric(), + ComputeError: "inner data types must be numeric" ); - try_binary_elementwise(a, b, |a: Option>, b| match (a, b) { + let s1 = a.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + let s2 = b.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + + let a: &ArrayChunked = s1.array()?; + let b: &ArrayChunked = 
s2.array()?; + + try_binary_elementwise(a, b, |a, b| match (a, b) { (Some(a), Some(b)) => { if a.null_count() > 0 || b.null_count() > 0 { polars_bail!(ComputeError: "array cannot contain nulls") @@ -85,11 +141,17 @@ pub fn cosine_dist( ComputeError: "inner data types don't match" ); polars_ensure!( - a.inner_dtype().is_float(), - ComputeError: "inner data types must be float" + a.inner_dtype().is_numeric(), + ComputeError: "inner data types must be numeric" ); - try_binary_elementwise(a, b, |a: Option>, b| match (a, b) { + let s1 = a.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + let s2 = b.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + + let a: &ArrayChunked = s1.array()?; + let b: &ArrayChunked = s2.array()?; + + try_binary_elementwise(a, b, |a, b| match (a, b) { (Some(a), Some(b)) => { if a.null_count() > 0 || b.null_count() > 0 { polars_bail!(ComputeError: "array cannot contain nulls") @@ -120,3 +182,38 @@ pub fn cosine_dist( _ => Ok(None), }) } + +pub fn minkowski_dist( + a: &ChunkedArray, + b: &ChunkedArray, + p: i32, +) -> PolarsResult { + polars_ensure!( + a.inner_dtype() == b.inner_dtype(), + ComputeError: "inner data types don't match" + ); + polars_ensure!( + a.inner_dtype().is_numeric(), + ComputeError: "inner data types must be numeric" + ); + + let s1 = a.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + let s2 = b.cast(&DataType::Array(Box::new(DataType::Float64), a.width()))?; + + let a: &ArrayChunked = s1.array()?; + let b: &ArrayChunked = s2.array()?; + + try_binary_elementwise(a, b, |a, b| match (a, b) { + (Some(a), Some(b)) => { + if a.null_count() > 0 || b.null_count() > 0 { + polars_bail!(ComputeError: "array cannot contain nulls") + } else { + let a = &collect_into_vecf64(a); + let b = &collect_into_vecf64(b); + let metric = minkowski(p); + Ok(Some(metric(a, b))) + } + } + _ => Ok(None), + }) +} diff --git a/polars_distance/polars_distance/src/expressions.rs 
b/polars_distance/polars_distance/src/expressions.rs index 4495821..af2bfe9 100644 --- a/polars_distance/polars_distance/src/expressions.rs +++ b/polars_distance/polars_distance/src/expressions.rs @@ -1,4 +1,6 @@ -use crate::array::{cosine_dist, distance_calc_float_inp, euclidean_dist}; +use crate::array::{ + cosine_dist, distance_calc_numeric_inp, distance_calc_uint_inp, euclidean_dist, minkowski_dist, +}; use crate::list::{ cosine_set_distance, jaccard_index, overlap_coef, sorensen_index, tversky_index, }; @@ -9,7 +11,7 @@ use crate::string::{ levenshtein_normalized_dist, osa_dist, osa_normalized_dist, postfix_dist, postfix_normalized_dist, prefix_dist, prefix_normalized_dist, }; -use distances::vectors::{canberra, chebyshev}; +use distances::vectors::{bray_curtis, canberra, chebyshev, l3_norm, l4_norm, manhattan}; use polars::prelude::*; use pyo3_polars::derive::polars_expr; use serde::Deserialize; @@ -20,6 +22,12 @@ struct TverskyIndexKwargs { beta: f64, } +#[derive(Deserialize)] +struct MinkowskiKwargs { + p: i32, +} + +// STR EXPRESSIONS #[polars_expr(output_type=UInt32)] fn hamming_str(inputs: &[Series]) -> PolarsResult { if inputs[0].dtype() != &DataType::Utf8 || inputs[1].dtype() != &DataType::Utf8 { @@ -261,6 +269,7 @@ fn prefix_normalized_str(inputs: &[Series]) -> PolarsResult { Ok(out.into_series()) } +// ARRAY EXPRESSIONS #[polars_expr(output_type=Float64)] fn euclidean_arr(inputs: &[Series]) -> PolarsResult { let x: &ArrayChunked = inputs[0].array()?; @@ -272,6 +281,7 @@ fn euclidean_arr(inputs: &[Series]) -> PolarsResult { `{}` width: {}, `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); } + euclidean_dist(x, y).map(|ca| ca.into_series()) } @@ -289,6 +299,20 @@ fn cosine_arr(inputs: &[Series]) -> PolarsResult { cosine_dist(x, y).map(|ca| ca.into_series()) } +#[polars_expr(output_type=Float64)] +fn minkowski_arr(inputs: &[Series], kwargs: MinkowskiKwargs) -> PolarsResult { + let x: &ArrayChunked = inputs[0].array()?; + let y: 
&ArrayChunked = inputs[1].array()?; + + if x.width() != y.width() { + polars_bail!(InvalidOperation: + "The dimensions of each array are not the same. + `{}` width: {}, + `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); + } + minkowski_dist(x, y, kwargs.p).map(|ca| ca.into_series()) +} + #[polars_expr(output_type=Float64)] fn chebyshev_arr(inputs: &[Series]) -> PolarsResult { let x: &ArrayChunked = inputs[0].array()?; @@ -300,7 +324,7 @@ fn chebyshev_arr(inputs: &[Series]) -> PolarsResult { `{}` width: {}, `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); } - distance_calc_float_inp(x, y, chebyshev).map(|ca| ca.into_series()) + distance_calc_numeric_inp(x, y, chebyshev).map(|ca| ca.into_series()) } #[polars_expr(output_type=Float64)] @@ -314,9 +338,70 @@ fn canberra_arr(inputs: &[Series]) -> PolarsResult { `{}` width: {}, `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); } - distance_calc_float_inp(x, y, canberra).map(|ca| ca.into_series()) + distance_calc_numeric_inp(x, y, canberra).map(|ca| ca.into_series()) +} + +#[polars_expr(output_type=Float64)] +fn manhatten_arr(inputs: &[Series]) -> PolarsResult { + let x: &ArrayChunked = inputs[0].array()?; + let y: &ArrayChunked = inputs[1].array()?; + + if x.width() != y.width() { + polars_bail!(InvalidOperation: + "The dimensions of each array are not the same. + `{}` width: {}, + `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); + } + + distance_calc_numeric_inp(x, y, manhattan).map(|ca| ca.into_series()) +} + +#[polars_expr(output_type=Float64)] +fn l3_norm_arr(inputs: &[Series]) -> PolarsResult { + let x: &ArrayChunked = inputs[0].array()?; + let y: &ArrayChunked = inputs[1].array()?; + + if x.width() != y.width() { + polars_bail!(InvalidOperation: + "The dimensions of each array are not the same. 
+ `{}` width: {}, + `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); + } + + distance_calc_numeric_inp(x, y, l3_norm).map(|ca| ca.into_series()) +} + +#[polars_expr(output_type=Float64)] +fn l4_norm_arr(inputs: &[Series]) -> PolarsResult { + let x: &ArrayChunked = inputs[0].array()?; + let y: &ArrayChunked = inputs[1].array()?; + + if x.width() != y.width() { + polars_bail!(InvalidOperation: + "The dimensions of each array are not the same. + `{}` width: {}, + `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); + } + + distance_calc_numeric_inp(x, y, l4_norm).map(|ca| ca.into_series()) +} + +#[polars_expr(output_type=Float64)] +fn bray_curtis_arr(inputs: &[Series]) -> PolarsResult { + let x: &ArrayChunked = inputs[0].array()?; + let y: &ArrayChunked = inputs[1].array()?; + + if x.width() != y.width() { + polars_bail!(InvalidOperation: + "The dimensions of each array are not the same. + `{}` width: {}, + `{}` width: {}", inputs[0].name(), x.width(), inputs[1].name(), y.width()); + } + + distance_calc_uint_inp(x, y, bray_curtis).map(|ca| ca.into_series()) } +// SET (list) EXPRESSIONS #[polars_expr(output_type=Float64)] fn jaccard_index_list(inputs: &[Series]) -> PolarsResult { let x: &ChunkedArray = inputs[0].list()?; diff --git a/polars_distance/tests/test_distance_arr.py b/polars_distance/tests/test_distance_arr.py index c3ad00c..a0e034a 100644 --- a/polars_distance/tests/test_distance_arr.py +++ b/polars_distance/tests/test_distance_arr.py @@ -76,6 +76,37 @@ def test_canberra(data): assert_frame_equal(result, expected) +def test_bray_curtis(data): + result = data.select( + pld.col("arr") + .cast(pl.Array(pl.UInt64, 4)) + .dist_arr.bray_curtis(pl.col("arr2").cast(pl.Array(pl.UInt64, 4))) + .alias("dist_bray"), + ) + + expected = pl.DataFrame( + [ + pl.Series("dist_bray", [0.5], dtype=pl.Float64), + ] + ) + + assert_frame_equal(result, expected) + + +def test_manhatten(data): + result = data.select( + 
pld.col("arr").dist_arr.manhatten("arr2").alias("dist_manhatten"), + ) + + expected = pl.DataFrame( + [ + pl.Series("dist_manhatten", [18.0], dtype=pl.Float64), + ] + ) + + assert_frame_equal(result, expected) + + def test_euclidean(data): result = data.select( pld.col("arr").dist_arr.euclidean("arr2").alias("dist_euclidean"), From 7ad94a3f169e84f0c66366797816096a8dd2bea0 Mon Sep 17 00:00:00 2001 From: ion-elgreco <15728914+ion-elgreco@users.noreply.github.com> Date: Thu, 21 Dec 2023 22:34:06 +0100 Subject: [PATCH 2/2] readme update --- README.md | 16 +++++++++++++++- polars_distance/polars_distance/README.md | 16 +++++++++++++++- .../polars_distance/polars_distance/__init__.py | 5 ++++- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 124095e..e3e4ef7 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,28 @@ The plugin provides three namespaces: - dist_str - hamming - levenshtein + - damerau_levenshtein + - indel + - jaro + - jaro_winkler + - lcs_seq + - osa + - postfix + - prefix - dist_arr - euclidean - cosine - chebyshev - canberra + - bray_curtis + - manhatten + - minkowski + - l3_norm + - l4_norm - dist_list (these act as set similary metrics) - jaccard_index - sorensen_index + - tversky_index - overlap_coef - cosine @@ -37,7 +51,7 @@ df.select( │ --- │ │ u32 │ ╞══════╡ -│ 1 │ +│ 7 │ └──────┘ diff --git a/polars_distance/polars_distance/README.md b/polars_distance/polars_distance/README.md index 124095e..e3e4ef7 100644 --- a/polars_distance/polars_distance/README.md +++ b/polars_distance/polars_distance/README.md @@ -7,14 +7,28 @@ The plugin provides three namespaces: - dist_str - hamming - levenshtein + - damerau_levenshtein + - indel + - jaro + - jaro_winkler + - lcs_seq + - osa + - postfix + - prefix - dist_arr - euclidean - cosine - chebyshev - canberra + - bray_curtis + - manhatten + - minkowski + - l3_norm + - l4_norm - dist_list (these act as set similary metrics) - jaccard_index - sorensen_index + - 
tversky_index - overlap_coef - cosine @@ -37,7 +51,7 @@ df.select( │ --- │ │ u32 │ ╞══════╡ -│ 1 │ +│ 7 │ └──────┘ diff --git a/polars_distance/polars_distance/polars_distance/__init__.py b/polars_distance/polars_distance/polars_distance/__init__.py index 6fabb3b..151d438 100644 --- a/polars_distance/polars_distance/polars_distance/__init__.py +++ b/polars_distance/polars_distance/polars_distance/__init__.py @@ -102,7 +102,10 @@ def __init__(self, expr: pl.Expr): self._expr = expr def hamming(self, other: IntoExpr, normalized: bool = False) -> pl.Expr: - """Returns hamming distance between two expressions""" + """Returns hamming distance between two expressions. + + The shorter string is padded to the length of the longer string. + """ if normalized: return self._expr.register_plugin( lib=lib,