Skip to content

Commit

Permalink
feat(python): Support "indexed" dicts as input to from_dict and `fr…
Browse files Browse the repository at this point in the history
…om_dicts`
  • Loading branch information
alexander-beedie committed Dec 18, 2024
1 parent 676f10d commit 6ff2099
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 9 deletions.
106 changes: 98 additions & 8 deletions py-polars/polars/convert/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,13 @@


def from_dict(
data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
data: Mapping[
str, Sequence[object] | Mapping[str, Sequence[object]] | Series | object
],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
indexed: bool | str = False,
strict: bool = True,
) -> DataFrame:
"""
Expand All @@ -48,8 +51,8 @@ def from_dict(
Parameters
----------
data : dict of sequences
Two-dimensional data represented as a dictionary. dict must contain
Sequences.
Two-dimensional data represented as a dictionary; the dictionary is expected
to contain Sequences of values.
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
The DataFrame schema may be declared in several ways:
Expand All @@ -63,6 +66,11 @@ def from_dict(
schema_overrides : dict, default None
Support type specification or override of one or more columns; note that
any dtypes inferred from the columns param will be overridden.
indexed : {bool, str}, default False
If True (or a string name), the `data` dictionary key is expected to represent
an index column, with values being dictionary records associated with that key.
If a string is passed then that will be the index column name, otherwise the
default value "index" is used.
strict : bool, default True
Throw an error if any `data` value does not exactly match the given or inferred
data type for that column. If set to `False`, values that do not match the data
Expand All @@ -75,6 +83,8 @@ def from_dict(
Examples
--------
Construct a DataFrame from a dictionary of sequences:
>>> df = pl.from_dict({"a": [1, 2], "b": [3, 4]})
>>> df
shape: (2, 2)
Expand All @@ -86,22 +96,60 @@ def from_dict(
│ 1 ┆ 3 │
│ 2 ┆ 4 │
└─────┴─────┘
Construct a DataFrame from an indexed dictionary of such data:
>>> df = pl.from_dict(
... data={
... "a": {"x": [1, 2], "y": [0.5, 2.5]},
... "b": {"x": [5, 6], "y": [5.0, 1.0]},
... },
... indexed=True,
... )
>>> df
shape: (4, 3)
┌───────┬─────┬─────┐
│ index ┆ x ┆ y │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 │
╞═══════╪═════╪═════╡
│ a ┆ 1 ┆ 0.5 │
│ a ┆ 2 ┆ 2.5 │
│ b ┆ 5 ┆ 5.0 │
│ b ┆ 6 ┆ 1.0 │
└───────┴─────┴─────┘
"""
return wrap_df(
dict_to_pydf(
data,
if indexed:
label = indexed if isinstance(indexed, str) else "index"
records = [
{label: idx, **dict(zip(values.keys(), v) if values else {})} # type: ignore[attr-defined,arg-type]
for idx, values in data.items()
for v in (zip(*values.values()) if values else (None,)) # type: ignore[attr-defined]
]
return from_records(
records,
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
orient="row",
)
else:
return wrap_df(
dict_to_pydf(
data, # type: ignore[arg-type]
schema=schema,
schema_overrides=schema_overrides,
strict=strict,
)
)
)


def from_dicts(
data: Iterable[dict[str, Any]],
data: Iterable[Mapping[str, Any]] | Mapping[Any, Iterable[Mapping[str, Any]]],
schema: SchemaDefinition | None = None,
*,
schema_overrides: SchemaDict | None = None,
indexed: bool | str = False,
strict: bool = True,
infer_schema_length: int | None = N_INFER_DEFAULT,
) -> DataFrame:
Expand Down Expand Up @@ -130,6 +178,11 @@ def from_dicts(
adding them to the schema.
schema_overrides : dict, default None
Support override of inferred types for one or more columns.
indexed : {bool, str}, default False
If True (or a string name), the `data` dictionary key is expected to represent
an index column, with values being a list of dictionary records associated
with that key. If a string is passed then that will be the index column name,
otherwise the default value "index" is used.
strict : bool, default True
Throw an error if any `data` value does not exactly match the given or inferred
data type for that column. If set to `False`, values that do not match the data
Expand Down Expand Up @@ -192,7 +245,44 @@ def from_dicts(
│ 2 ┆ 5 ┆ null ┆ null │
│ 3 ┆ 6 ┆ null ┆ null │
└─────┴─────┴──────┴──────┘
Indexed records can also be loaded straightforwardly:
>>> data = {
... "a": [
... {"w": None, "x": 1, "y": 2.5, "z": None},
... {"x": 8, "y": 5.0, "w": None, "z": None},
... ],
... "b": [
... {"x": None, "y": 2.0, "w": 0, "z": 8},
... {"x": None, "y": 3.0, "w": 0, "z": 7},
... ],
... "c": [
... {"y": None, "w": None, "z": None, "x": 0},
... ],
... }
>>> pl.from_dicts(data, indexed=True)
shape: (5, 5)
┌───────┬──────┬──────┬──────┬──────┐
│ index ┆ w ┆ x ┆ y ┆ z │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ f64 ┆ i64 │
╞═══════╪══════╪══════╪══════╪══════╡
│ a ┆ null ┆ 1 ┆ 2.5 ┆ null │
│ a ┆ null ┆ 8 ┆ 5.0 ┆ null │
│ b ┆ 0 ┆ null ┆ 2.0 ┆ 8 │
│ b ┆ 0 ┆ null ┆ 3.0 ┆ 7 │
│ c ┆ null ┆ 0 ┆ null ┆ null │
└───────┴──────┴──────┴──────┴──────┘
"""
if indexed:
label = indexed if isinstance(indexed, str) else "index"
data = [
{label: key, **record}
for key, records in data.items() # type: ignore[union-attr]
for record in records
]

if not data and not (schema or schema_overrides):
msg = "no data, cannot infer schema"
raise NoDataError(msg)
Expand Down
58 changes: 57 additions & 1 deletion py-polars/tests/unit/dataframe/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,5 +253,61 @@ def test_from_dict_cast_logical_type(dtype: pl.DataType, data: Any) -> None:
],
schema=schema,
)

assert_frame_equal(df_from_dicts, df)


def test_from_dict_indexed() -> None:
    """Check `from_dict` with `indexed` given as True and as a custom label."""
    # each top-level key is an index value; "c" carries no record data at all
    indexed_input = {
        "a": {"x": [1, 8], "y": [2.5, 5.0]},
        "b": {"w": [0, 0], "y": [2, 3], "z": [8, 7]},
        "c": None,
    }
    expected = pl.DataFrame(
        {
            "index": ["a", "a", "b", "b", "c"],
            "x": [1, 8, None, None, None],
            "y": [2.5, 5.0, 2.0, 3.0, None],
            "w": [None, None, 0, 0, None],
            "z": [None, None, 8, 7, None],
        }
    )

    # indexed=True -> the index column takes the default name
    default_label_frame = pl.from_dict(indexed_input, indexed=True)
    assert_frame_equal(expected, default_label_frame)

    # indexed="key" -> the index column takes the given name
    custom_label_frame = pl.from_dict(data=indexed_input, indexed="key")
    expected_with_key = expected.rename({"index": "key"})
    assert_frame_equal(expected_with_key, custom_label_frame)


def test_from_dicts_indexed() -> None:
    """Round-trip a frame through `rows_by_key` and indexed `from_dicts`."""
    source_frame = pl.DataFrame(
        {
            "idx": ["a", "a", "b", "b", "c"],
            "x": [1, 8, None, None, None],
            "y": [2.5, 5.0, 2.0, 3.0, None],
            "w": [None, None, 0, 0, None],
            "z": [None, None, 8, 7, None],
        }
    )

    # export: group the rows (as named dicts) under their "idx" value
    expected_export = {
        "a": [
            {"x": 1, "y": 2.5, "w": None, "z": None},
            {"x": 8, "y": 5.0, "w": None, "z": None},
        ],
        "b": [
            {"x": None, "y": 2.0, "w": 0, "z": 8},
            {"x": None, "y": 3.0, "w": 0, "z": 7},
        ],
        "c": [
            {"x": None, "y": None, "w": None, "z": None},
        ],
    }
    records_by_key = source_frame.rows_by_key("idx", named=True)
    assert records_by_key == expected_export

    # import with the default index label
    default_label_frame = pl.from_dicts(records_by_key, indexed=True)
    assert_frame_equal(default_label_frame, source_frame.rename({"idx": "index"}))

    # import with a custom index label, renamed back for comparison
    custom_label_frame = pl.from_dicts(data=records_by_key, indexed="key")
    assert_frame_equal(custom_label_frame.rename({"key": "idx"}), source_frame)

0 comments on commit 6ff2099

Please sign in to comment.