feat(python): Support "indexed" dicts as input to from_dict and `from_dicts`
pola-rs · Jan 14, 2025 · e7cecff · e7cecff
1 parent 8dcaec2
commit e7cecff
Show file tree

Hide file tree

Showing 2 changed files with 155 additions and 9 deletions.
diff --git a/py-polars/polars/convert/general.py b/py-polars/polars/convert/general.py
@@ -34,10 +34,13 @@
 
 
 def from_dict(
-    data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series],
+    data: Mapping[
+        str, Sequence[object] | Mapping[str, Sequence[object]] | Series | object
+    ],
     schema: SchemaDefinition | None = None,
     *,
     schema_overrides: SchemaDict | None = None,
+    indexed: bool | str = False,
     strict: bool = True,
 ) -> DataFrame:
     """
@@ -48,8 +51,8 @@ def from_dict(
     Parameters
     ----------
     data : dict of sequences
-        Two-dimensional data represented as a dictionary. dict must contain
-        Sequences.
+        Two-dimensional data represented as a dictionary; the dictionary is expected
+        to contain Sequences of values.
     schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
         The DataFrame schema may be declared in several ways:
 
@@ -63,6 +66,11 @@ def from_dict(
     schema_overrides : dict, default None
         Support type specification or override of one or more columns; note that
         any dtypes inferred from the columns param will be overridden.
+    indexed : {bool, str}, default False
+        If True (or a string name), the `data` dictionary key is expected to represent
+        an index column, with values being dictionary records associated with that key.
+        If a string is passed then that will be the index column name, otherwise the
+        default value "index" is used.
     strict : bool, default True
         Throw an error if any `data` value does not exactly match the given or inferred
         data type for that column. If set to `False`, values that do not match the data
@@ -75,6 +83,8 @@ def from_dict(
 
     Examples
     --------
+    Construct a DataFrame from a dictionary of sequences:
+
     >>> df = pl.from_dict({"a": [1, 2], "b": [3, 4]})
     >>> df
     shape: (2, 2)
@@ -86,22 +96,60 @@ def from_dict(
     │ 1   ┆ 3   │
     │ 2   ┆ 4   │
     └─────┴─────┘
+
+    Construct a DataFrame from an indexed dictionary of such data:
+
+    >>> df = pl.from_dict(
+    ...     data={
+    ...         "a": {"x": [1, 2], "y": [0.5, 2.5]},
+    ...         "b": {"x": [5, 6], "y": [5.0, 1.0]},
+    ...     },
+    ...     indexed=True,
+    ... )
+    >>> df
+    shape: (4, 3)
+    ┌───────┬─────┬─────┐
+    │ index ┆ x   ┆ y   │
+    │ ---   ┆ --- ┆ --- │
+    │ str   ┆ i64 ┆ f64 │
+    ╞═══════╪═════╪═════╡
+    │ a     ┆ 1   ┆ 0.5 │
+    │ a     ┆ 2   ┆ 2.5 │
+    │ b     ┆ 5   ┆ 5.0 │
+    │ b     ┆ 6   ┆ 1.0 │
+    └───────┴─────┴─────┘
     """
-    return wrap_df(
-        dict_to_pydf(
-            data,
+    if indexed:
+        label = indexed if isinstance(indexed, str) else "index"
+        records = [
+            {label: idx, **dict(zip(values.keys(), v) if values else {})}  # type: ignore[attr-defined,arg-type]
+            for idx, values in data.items()
+            for v in (zip(*values.values()) if values else (None,))  # type: ignore[attr-defined]
+        ]
+        return from_records(
+            records,
             schema=schema,
             schema_overrides=schema_overrides,
             strict=strict,
+            orient="row",
+        )
+    else:
+        return wrap_df(
+            dict_to_pydf(
+                data,  # type: ignore[arg-type]
+                schema=schema,
+                schema_overrides=schema_overrides,
+                strict=strict,
+            )
         )
-    )
 
 
 def from_dicts(
-    data: Iterable[dict[str, Any]],
+    data: Iterable[Mapping[str, Any]] | Mapping[Any, Iterable[Mapping[str, Any]]],
     schema: SchemaDefinition | None = None,
     *,
     schema_overrides: SchemaDict | None = None,
+    indexed: bool | str = False,
     strict: bool = True,
     infer_schema_length: int | None = N_INFER_DEFAULT,
 ) -> DataFrame:
@@ -130,6 +178,11 @@ def from_dicts(
         adding them to the schema.
     schema_overrides : dict, default None
         Support override of inferred types for one or more columns.
+    indexed : {bool, str}, default False
+        If True (or a string name), the `data` dictionary key is expected to represent
+        an index column, with values being a list of dictionary records associated
+        with that key. If a string is passed then that will be the index column name,
+        otherwise the default value "index" is used.
     strict : bool, default True
         Throw an error if any `data` value does not exactly match the given or inferred
         data type for that column. If set to `False`, values that do not match the data
@@ -192,7 +245,44 @@ def from_dicts(
     │ 2   ┆ 5   ┆ null ┆ null │
     │ 3   ┆ 6   ┆ null ┆ null │
     └─────┴─────┴──────┴──────┘
+
+    Indexed records can also be loaded straightforwardly:
+
+    >>> data = {
+    ...     "a": [
+    ...         {"w": None, "x": 1, "y": 2.5, "z": None},
+    ...         {"x": 8, "y": 5.0, "w": None, "z": None},
+    ...     ],
+    ...     "b": [
+    ...         {"x": None, "y": 2.0, "w": 0, "z": 8},
+    ...         {"x": None, "y": 3.0, "w": 0, "z": 7},
+    ...     ],
+    ...     "c": [
+    ...         {"y": None, "w": None, "z": None, "x": 0},
+    ...     ],
+    ... }
+    >>> pl.from_dicts(data, indexed=True)
+    shape: (5, 5)
+    ┌───────┬──────┬──────┬──────┬──────┐
+    │ index ┆ w    ┆ x    ┆ y    ┆ z    │
+    │ ---   ┆ ---  ┆ ---  ┆ ---  ┆ ---  │
+    │ str   ┆ i64  ┆ i64  ┆ f64  ┆ i64  │
+    ╞═══════╪══════╪══════╪══════╪══════╡
+    │ a     ┆ null ┆ 1    ┆ 2.5  ┆ null │
+    │ a     ┆ null ┆ 8    ┆ 5.0  ┆ null │
+    │ b     ┆ 0    ┆ null ┆ 2.0  ┆ 8    │
+    │ b     ┆ 0    ┆ null ┆ 3.0  ┆ 7    │
+    │ c     ┆ null ┆ 0    ┆ null ┆ null │
+    └───────┴──────┴──────┴──────┴──────┘
     """
+    if indexed:
+        label = indexed if isinstance(indexed, str) else "index"
+        data = [
+            {label: key, **record}
+            for key, records in data.items()  # type: ignore[union-attr]
+            for record in records
+        ]
+
     if not data and not (schema or schema_overrides):
         msg = "no data, cannot infer schema"
         raise NoDataError(msg)

diff --git a/py-polars/tests/unit/dataframe/test_from_dict.py b/py-polars/tests/unit/dataframe/test_from_dict.py
@@ -253,5 +253,61 @@ def test_from_dict_cast_logical_type(dtype: pl.DataType, data: Any) -> None:
         ],
         schema=schema,
     )
-
     assert_frame_equal(df_from_dicts, df)
+
+
+def test_from_dict_indexed() -> None:
+    data = {
+        "a": {"x": [1, 8], "y": [2.5, 5.0]},
+        "b": {"w": [0, 0], "y": [2, 3], "z": [8, 7]},
+        "c": None,
+    }
+    expected_data = {
+        "index": ["a", "a", "b", "b", "c"],
+        "x": [1, 8, None, None, None],
+        "y": [2.5, 5.0, 2.0, 3.0, None],
+        "w": [None, None, 0, 0, None],
+        "z": [None, None, 8, 7, None],
+    }
+    expected_frame = pl.DataFrame(expected_data)
+
+    res = pl.from_dict(data, indexed=True)
+    assert_frame_equal(expected_frame, res)
+
+    res = pl.from_dict(data=data, indexed="key")
+    assert_frame_equal(expected_frame.rename({"index": "key"}), res)
+
+
+def test_from_dicts_indexed() -> None:
+    df = pl.DataFrame(
+        {
+            "idx": ["a", "a", "b", "b", "c"],
+            "x": [1, 8, None, None, None],
+            "y": [2.5, 5.0, 2.0, 3.0, None],
+            "w": [None, None, 0, 0, None],
+            "z": [None, None, 8, 7, None],
+        }
+    )
+
+    # export records...
+    indexed_records = df.rows_by_key("idx", named=True)
+    assert indexed_records == {
+        "a": [
+            {"x": 1, "y": 2.5, "w": None, "z": None},
+            {"x": 8, "y": 5.0, "w": None, "z": None},
+        ],
+        "b": [
+            {"x": None, "y": 2.0, "w": 0, "z": 8},
+            {"x": None, "y": 3.0, "w": 0, "z": 7},
+        ],
+        "c": [
+            {"x": None, "y": None, "w": None, "z": None},
+        ],
+    }
+
+    # ...and read them back
+    res = pl.from_dicts(indexed_records, indexed=True)
+    assert_frame_equal(res, df.rename({"idx": "index"}))
+
+    res = pl.from_dicts(data=indexed_records, indexed="key")
+    assert_frame_equal(res.rename({"key": "idx"}), df)