feat(python): Improve string → temporal parsing in read_excel and read_ods #20845

Merged · 1 commit · Jan 24, 2025
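
In user-facing terms, the change lets `schema_overrides` that target Date, Datetime, or Time dtypes also apply when the engine delivers the underlying cells as strings, instead of the cast failing. A minimal sketch of the intended usage; the file name and column name here are hypothetical, not taken from this PR:

    import polars as pl

    # hypothetical workbook whose "posted" column holds text such as "2025-01-24"
    df = pl.read_excel(
        "report.xlsx",  # assumed example file, not part of this PR
        schema_overrides={"posted": pl.Date},
    )
    print(df.schema)  # the "posted" column should now come back as pl.Date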
py-polars/polars/io/spreadsheet/functions.py (136 additions & 102 deletions)
@@ -28,6 +28,7 @@
     Int64,
     Null,
     String,
+    Time,
     UInt8,
 )
 from polars.datatypes.group import FLOAT_DTYPES, INTEGER_DTYPES, NUMERIC_DTYPES
@@ -661,7 +662,7 @@ def _read_spreadsheet(
         infer_schema_length=infer_schema_length,
     )
     engine_options = (engine_options or {}).copy()
-    schema_overrides = pl.Schema(schema_overrides or {})
+    schema_overrides = dict(schema_overrides or {})
 
     # establish the reading function, parser, and available worksheets
     reader_fn, parser, worksheets = _initialise_spreadsheet_parser(
@@ -985,102 +986,6 @@ def _reorder_columns(
     return df
 
 
-def _read_spreadsheet_openpyxl(
-    parser: Any,
-    *,
-    sheet_name: str | None,
-    read_options: dict[str, Any],
-    schema_overrides: SchemaDict | None,
-    columns: Sequence[int] | Sequence[str] | None,
-    table_name: str | None = None,
-    drop_empty_rows: bool,
-    drop_empty_cols: bool,
-    raise_if_empty: bool,
-) -> pl.DataFrame:
-    """Use the 'openpyxl' library to read data from the given worksheet."""
-    infer_schema_length = read_options.pop("infer_schema_length", None)
-    has_header = read_options.pop("has_header", True)
-    no_inference = infer_schema_length == 0
-    header: list[str | None] = []
-
-    if table_name and not sheet_name:
-        sheet_name, n_tables = None, 0
-        for sheet in parser.worksheets:
-            n_tables += 1
-            if table_name in sheet.tables:
-                ws, sheet_name = sheet, sheet.title
-                break
-        if sheet_name is None:
-            msg = (
-                f"table named {table_name!r} not found in sheet {sheet_name!r}"
-                if n_tables
-                else f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
-            )
-            raise RuntimeError(msg)
-    else:
-        ws = parser[sheet_name]
-
-    # prefer detection of actual table objects; otherwise read
-    # data in the used worksheet range, dropping null columns
-    if tables := getattr(ws, "tables", None):
-        table = tables[table_name] if table_name else next(iter(tables.values()))
-        rows = list(ws[table.ref])
-        if not rows:
-            return _empty_frame(raise_if_empty)
-        if has_header:
-            header.extend(cell.value for cell in rows.pop(0))
-        else:
-            header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
-        if table.totalsRowCount:
-            rows = rows[: -table.totalsRowCount]
-        rows_iter = rows
-    elif table_name:
-        msg = f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
-        raise RuntimeError(msg)
-    else:
-        if not has_header:
-            if not (rows_iter := list(ws.iter_rows())):
-                return _empty_frame(raise_if_empty)
-            n_cols = len(rows_iter[0])
-            header = [f"column_{n}" for n in range(1, n_cols + 1)]
-        else:
-            rows_iter = ws.iter_rows()
-            for row in rows_iter:
-                row_values = [cell.value for cell in row]
-                if any(v is not None for v in row_values):
-                    header.extend(row_values)
-                    break
-
-    dtype = String if no_inference else None
-    series_data = []
-    for name, column_data in zip(header, zip(*rows_iter)):
-        if name or not drop_empty_cols:
-            values = [cell.value for cell in column_data]
-            if no_inference or (dtype := (schema_overrides or {}).get(name)) == String:  # type: ignore[assignment,arg-type]
-                # note: if we initialise the series with mixed-type data (eg: str/int)
-                # then the non-strings will become null, so we handle the cast here
-                values = [str(v) if (v is not None) else v for v in values]
-
-            s = pl.Series(name, values, dtype=dtype, strict=False)
-            series_data.append(s)
-
-    names = deduplicate_names(s.name for s in series_data)
-    df = pl.DataFrame(
-        dict(zip(names, series_data)),
-        schema_overrides=schema_overrides,
-        infer_schema_length=infer_schema_length,
-        strict=False,
-    )
-    df = _drop_null_data(
-        df,
-        raise_if_empty=raise_if_empty,
-        drop_empty_rows=drop_empty_rows,
-        drop_empty_cols=drop_empty_cols,
-    )
-    df = _reorder_columns(df, columns)
-    return df
-
-
 def _read_spreadsheet_calamine(
     parser: Any,
     *,
@@ -1130,10 +1035,6 @@ def _read_spreadsheet_calamine(
                 parser_dtypes[name] = "float"
             elif base_dtype == String:
                 parser_dtypes[name] = "string"
-            elif base_dtype == Datetime:
-                parser_dtypes[name] = "datetime"
-            elif base_dtype == Date:
-                parser_dtypes[name] = "date"
             elif base_dtype == Duration:
                 parser_dtypes[name] = "duration"
             elif base_dtype == Boolean:
@@ -1170,7 +1071,30 @@
     # note: even if we applied parser dtypes we still re-apply schema_overrides
     # natively as we can refine integer/float types, temporal precision, etc.
     if schema_overrides:
-        df = df.cast(dtypes=schema_overrides)
+        lf, schema = df.lazy(), df.schema
+        str_to_temporal, updated_overrides = [], {}
+        for nm, tp in schema_overrides.items():
+            if schema[nm] != String:
+                updated_overrides[nm] = tp
+            elif tp == Datetime:
+                str_to_temporal.append(
+                    F.col(nm).str.to_datetime(
+                        time_unit=getattr(tp, "time_unit", None),
+                        time_zone=getattr(tp, "time_zone", None),
+                    )
+                )
+            elif tp == Date:
+                str_to_temporal.append(F.col(nm).str.to_date())
+            elif tp == Time:
+                str_to_temporal.append(F.col(nm).str.to_time())
+            else:
+                updated_overrides[nm] = tp
+
+        if str_to_temporal:
+            lf = lf.with_columns(*str_to_temporal)
+        if updated_overrides:
+            lf = lf.cast(dtypes=updated_overrides)
+        df = lf.collect()
 
     # standardise on string dtype for null columns in empty frame
     if df.is_empty():
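
The calamine branch above splits the overrides in two: string columns headed for a temporal dtype are parsed with str.to_datetime / str.to_date / str.to_time expressions, while everything else stays a plain cast, and both are applied in one lazy pass. A standalone sketch of that pattern on assumed data (frame and column names are illustrative):

    import polars as pl

    df = pl.DataFrame({"when": ["2025-01-24 12:30:00"], "n": ["7"]})
    overrides = {"when": pl.Datetime("us"), "n": pl.Int64}

    exprs, plain_casts = [], {}
    for name, tp in overrides.items():
        if df.schema[name] == pl.String and tp == pl.Datetime:
            # parse the strings rather than casting them directly
            exprs.append(pl.col(name).str.to_datetime(time_unit="us"))
        else:
            plain_casts[name] = tp

    out = df.lazy().with_columns(*exprs).cast(plain_casts).collect()
    print(out.schema)  # when -> Datetime("us"), n -> Int64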
@@ -1205,6 +1129,116 @@ def _read_spreadsheet_calamine(
     return df
 
 
+def _read_spreadsheet_openpyxl(
+    parser: Any,
+    *,
+    sheet_name: str | None,
+    read_options: dict[str, Any],
+    schema_overrides: SchemaDict | None,
+    columns: Sequence[int] | Sequence[str] | None,
+    table_name: str | None = None,
+    drop_empty_rows: bool,
+    drop_empty_cols: bool,
+    raise_if_empty: bool,
+) -> pl.DataFrame:
+    """Use the 'openpyxl' library to read data from the given worksheet."""
+    infer_schema_length = read_options.pop("infer_schema_length", None)
+    has_header = read_options.pop("has_header", True)
+    schema_overrides = schema_overrides or {}
+    no_inference = infer_schema_length == 0
+    header: list[str | None] = []
+
+    if table_name and not sheet_name:
+        sheet_name, n_tables = None, 0
+        for sheet in parser.worksheets:
+            n_tables += 1
+            if table_name in sheet.tables:
+                ws, sheet_name = sheet, sheet.title
+                break
+        if sheet_name is None:
+            msg = (
+                f"table named {table_name!r} not found in sheet {sheet_name!r}"
+                if n_tables
+                else f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
+            )
+            raise RuntimeError(msg)
+    else:
+        ws = parser[sheet_name]
+
+    # prefer detection of actual table objects; otherwise read
+    # data in the used worksheet range, dropping null columns
+    if tables := getattr(ws, "tables", None):
+        table = tables[table_name] if table_name else next(iter(tables.values()))
+        rows = list(ws[table.ref])
+        if not rows:
+            return _empty_frame(raise_if_empty)
+        if has_header:
+            header.extend(cell.value for cell in rows.pop(0))
+        else:
+            header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
+        if table.totalsRowCount:
+            rows = rows[: -table.totalsRowCount]
+        rows_iter = rows
+    elif table_name:
+        msg = f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
+        raise RuntimeError(msg)
+    else:
+        if not has_header:
+            if not (rows_iter := list(ws.iter_rows())):
+                return _empty_frame(raise_if_empty)
+            n_cols = len(rows_iter[0])
+            header = [f"column_{n}" for n in range(1, n_cols + 1)]
+        else:
+            rows_iter = ws.iter_rows()
+            for row in rows_iter:
+                row_values = [cell.value for cell in row]
+                if any(v is not None for v in row_values):
+                    header.extend(row_values)
+                    break
+
+    dtype = String if no_inference else None
+    series_data = []
+    for name, column_data in zip(header, zip(*rows_iter)):
+        if name or not drop_empty_cols:
+            values = [cell.value for cell in column_data]
+            if no_inference or (dtype := schema_overrides.get(name)) == String:  # type: ignore[assignment,arg-type]
+                # note: if we initialise the series with mixed-type data (eg: str/int)
+                # then the non-strings will become null, so we handle the cast here
+                values = [str(v) if (v is not None) else v for v in values]
+
+            if (tp := schema_overrides.get(name)) in (Date, Datetime, Time):  # type: ignore[operator,arg-type]
+                s = pl.Series(name, values, strict=False)
+                if s.dtype == String:
+                    if tp == Datetime:
+                        s = s.str.to_datetime(
+                            time_unit=getattr(tp, "time_unit", None),
+                            time_zone=getattr(tp, "time_zone", None),
+                        )
+                    elif tp == Date:
+                        s = s.str.strip_suffix(" 00:00:00").str.to_date()
+                    elif tp == Time:
+                        s = s.str.to_time()
+            else:
+                s = pl.Series(name, values, dtype=dtype, strict=False)
+            series_data.append(s)
+
+    names = deduplicate_names(s.name for s in series_data)
+    df = pl.DataFrame(
+        dict(zip(names, series_data)),
+        schema_overrides=schema_overrides,
+        infer_schema_length=infer_schema_length,
+        strict=False,
+    )
+    df = _drop_null_data(
+        df,
+        raise_if_empty=raise_if_empty,
+        drop_empty_rows=drop_empty_rows,
+        drop_empty_cols=drop_empty_cols,
+    )
+    df = _reorder_columns(df, columns)
+    return df
+
+
 def _read_spreadsheet_xlsx2csv(
     parser: Any,
     *,
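The relocated openpyxl reader takes the eager route instead: it builds each Series first and, if the column materialises as String while the override asks for a temporal dtype, parses it in place, stripping a trailing " 00:00:00" before Date parsing. A small sketch of that conversion step on assumed values:

    import polars as pl

    s = pl.Series("d", ["2025-01-24 00:00:00", None])
    # date cells sometimes surface as text with a midnight time attached;
    # dropping the suffix lets str.to_date parse what remains
    d = s.str.strip_suffix(" 00:00:00").str.to_date()
    print(d.to_list())  # [datetime.date(2025, 1, 24), None]
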
py-polars/polars/series/categorical.py (1 addition & 1 deletion)
@@ -96,7 +96,7 @@ def to_local(self) -> Series:
     @unstable()
     def uses_lexical_ordering(self) -> bool:
         """
-        Return whether or not the series uses lexical ordering.
+        Indicate whether the Series uses lexical ordering.
 
         .. warning::
             This functionality is considered **unstable**. It may be changed
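For context, the retouched docstring belongs to a method that reports how a Categorical Series orders its values. A brief usage sketch (my reading of the API; the method is marked unstable):

    import polars as pl

    s = pl.Series(["b", "a"], dtype=pl.Categorical)
    print(s.cat.uses_lexical_ordering())  # False: ordered by physical encoding

    s = pl.Series(["b", "a"], dtype=pl.Categorical("lexical"))
    print(s.cat.uses_lexical_ordering())  # True: compares like plain strings
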
py-polars/polars/series/string.py (2 additions & 2 deletions)
@@ -2060,9 +2060,9 @@ def find_many(
         overlapping: bool = False,
     ) -> Series:
         """
-        Use the Aho-Corasick algorithm to find many matches.
+        Use the Aho-Corasick algorithm to find all matches.
 
-        The function will return the bytes offset of the start of each match.
+        The function returns the byte offset of the start of each match.
         The return type will be `List<UInt32>`
 
         Parameters
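And the corrected find_many wording in action; a minimal sketch of the byte offsets it returns, with illustrative values:

    import polars as pl

    s = pl.Series("text", ["the lazy dog"])
    # one inner list per row, holding the byte offset of each match start
    print(s.str.find_many(["la", "do"]))  # expected: [[4, 9]]
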
Binary file modified: py-polars/tests/unit/io/files/example.xls
Binary file modified: py-polars/tests/unit/io/files/example.xlsb
Binary file modified: py-polars/tests/unit/io/files/example.xlsx