feat(python): Improve string → temporal parsing in read_excel and read_ods #20845

Merged · 1 commit · Jan 24, 2025
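
In user-facing terms, the change lets `schema_overrides` that target Date, Datetime, or Time dtypes also apply when the engine delivers the underlying cells as strings, instead of the cast failing. A minimal sketch of the intended usage; the file name and column name here are hypothetical, not taken from this PR:

    import polars as pl

    # hypothetical workbook whose "posted" column holds text such as "2025-01-24"
    df = pl.read_excel(
        "report.xlsx",  # assumed example file, not part of this PR
        schema_overrides={"posted": pl.Date},
    )
    print(df.schema)  # the "posted" column should now come back as pl.Date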
py-polars/polars/io/spreadsheet/functions.py (136 additions & 102 deletions)
@@ -28,6 +28,7 @@
     Int64,
     Null,
     String,
+    Time,
     UInt8,
 )
 from polars.datatypes.group import FLOAT_DTYPES, INTEGER_DTYPES, NUMERIC_DTYPES
@@ -661,7 +662,7 @@ def _read_spreadsheet(
         infer_schema_length=infer_schema_length,
     )
     engine_options = (engine_options or {}).copy()
-    schema_overrides = pl.Schema(schema_overrides or {})
+    schema_overrides = dict(schema_overrides or {})
 
     # establish the reading function, parser, and available worksheets
     reader_fn, parser, worksheets = _initialise_spreadsheet_parser(
@@ -985,102 +986,6 @@ def _reorder_columns(
     return df
 
 
-def _read_spreadsheet_openpyxl(
-    parser: Any,
-    *,
-    sheet_name: str | None,
-    read_options: dict[str, Any],
-    schema_overrides: SchemaDict | None,
-    columns: Sequence[int] | Sequence[str] | None,
-    table_name: str | None = None,
-    drop_empty_rows: bool,
-    drop_empty_cols: bool,
-    raise_if_empty: bool,
-) -> pl.DataFrame:
-    """Use the 'openpyxl' library to read data from the given worksheet."""
-    infer_schema_length = read_options.pop("infer_schema_length", None)
-    has_header = read_options.pop("has_header", True)
-    no_inference = infer_schema_length == 0
-    header: list[str | None] = []
-
-    if table_name and not sheet_name:
-        sheet_name, n_tables = None, 0
-        for sheet in parser.worksheets:
-            n_tables += 1
-            if table_name in sheet.tables:
-                ws, sheet_name = sheet, sheet.title
-                break
-        if sheet_name is None:
-            msg = (
-                f"table named {table_name!r} not found in sheet {sheet_name!r}"
-                if n_tables
-                else f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
-            )
-            raise RuntimeError(msg)
-    else:
-        ws = parser[sheet_name]
-
-    # prefer detection of actual table objects; otherwise read
-    # data in the used worksheet range, dropping null columns
-    if tables := getattr(ws, "tables", None):
-        table = tables[table_name] if table_name else next(iter(tables.values()))
-        rows = list(ws[table.ref])
-        if not rows:
-            return _empty_frame(raise_if_empty)
-        if has_header:
-            header.extend(cell.value for cell in rows.pop(0))
-        else:
-            header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
-        if table.totalsRowCount:
-            rows = rows[: -table.totalsRowCount]
-        rows_iter = rows
-    elif table_name:
-        msg = f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
-        raise RuntimeError(msg)
-    else:
-        if not has_header:
-            if not (rows_iter := list(ws.iter_rows())):
-                return _empty_frame(raise_if_empty)
-            n_cols = len(rows_iter[0])
-            header = [f"column_{n}" for n in range(1, n_cols + 1)]
-        else:
-            rows_iter = ws.iter_rows()
-            for row in rows_iter:
-                row_values = [cell.value for cell in row]
-                if any(v is not None for v in row_values):
-                    header.extend(row_values)
-                    break
-
-    dtype = String if no_inference else None
-    series_data = []
-    for name, column_data in zip(header, zip(*rows_iter)):
-        if name or not drop_empty_cols:
-            values = [cell.value for cell in column_data]
-            if no_inference or (dtype := (schema_overrides or {}).get(name)) == String:  # type: ignore[assignment,arg-type]
-                # note: if we initialise the series with mixed-type data (eg: str/int)
-                # then the non-strings will become null, so we handle the cast here
-                values = [str(v) if (v is not None) else v for v in values]
-
-            s = pl.Series(name, values, dtype=dtype, strict=False)
-            series_data.append(s)
-
-    names = deduplicate_names(s.name for s in series_data)
-    df = pl.DataFrame(
-        dict(zip(names, series_data)),
-        schema_overrides=schema_overrides,
-        infer_schema_length=infer_schema_length,
-        strict=False,
-    )
-    df = _drop_null_data(
-        df,
-        raise_if_empty=raise_if_empty,
-        drop_empty_rows=drop_empty_rows,
-        drop_empty_cols=drop_empty_cols,
-    )
-    df = _reorder_columns(df, columns)
-    return df
-
-
 def _read_spreadsheet_calamine(
     parser: Any,
     *,
@@ -1130,10 +1035,6 @@ def _read_spreadsheet_calamine(
                 parser_dtypes[name] = "float"
             elif base_dtype == String:
                 parser_dtypes[name] = "string"
-            elif base_dtype == Datetime:
-                parser_dtypes[name] = "datetime"
-            elif base_dtype == Date:
-                parser_dtypes[name] = "date"
             elif base_dtype == Duration:
                 parser_dtypes[name] = "duration"
             elif base_dtype == Boolean:
@@ -1170,7 +1071,30 @@
     # note: even if we applied parser dtypes we still re-apply schema_overrides
     # natively as we can refine integer/float types, temporal precision, etc.
     if schema_overrides:
-        df = df.cast(dtypes=schema_overrides)
+        lf, schema = df.lazy(), df.schema
+        str_to_temporal, updated_overrides = [], {}
+        for nm, tp in schema_overrides.items():
+            if schema[nm] != String:
+                updated_overrides[nm] = tp
+            elif tp == Datetime:
+                str_to_temporal.append(
+                    F.col(nm).str.to_datetime(
+                        time_unit=getattr(tp, "time_unit", None),
+                        time_zone=getattr(tp, "time_zone", None),
+                    )
+                )
+            elif tp == Date:
+                str_to_temporal.append(F.col(nm).str.to_date())
+            elif tp == Time:
+                str_to_temporal.append(F.col(nm).str.to_time())
+            else:
+                updated_overrides[nm] = tp
+
+        if str_to_temporal:
+            lf = lf.with_columns(*str_to_temporal)
+        if updated_overrides:
+            lf = lf.cast(dtypes=updated_overrides)
+        df = lf.collect()
 
     # standardise on string dtype for null columns in empty frame
     if df.is_empty():
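
The calamine branch above splits the overrides in two: string columns headed for a temporal dtype are parsed with str.to_datetime / str.to_date / str.to_time expressions, while everything else stays a plain cast, and both are applied in one lazy pass. A standalone sketch of that pattern on assumed data (frame and column names are illustrative):

    import polars as pl

    df = pl.DataFrame({"when": ["2025-01-24 12:30:00"], "n": ["7"]})
    overrides = {"when": pl.Datetime("us"), "n": pl.Int64}

    exprs, plain_casts = [], {}
    for name, tp in overrides.items():
        if df.schema[name] == pl.String and tp == pl.Datetime:
            # parse the strings rather than casting them directly
            exprs.append(pl.col(name).str.to_datetime(time_unit="us"))
        else:
            plain_casts[name] = tp

    out = df.lazy().with_columns(*exprs).cast(plain_casts).collect()
    print(out.schema)  # when -> Datetime("us"), n -> Int64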
@@ -1205,6 +1129,116 @@ def _read_spreadsheet_calamine(
     return df
 
 
+def _read_spreadsheet_openpyxl(
+    parser: Any,
+    *,
+    sheet_name: str | None,
+    read_options: dict[str, Any],
+    schema_overrides: SchemaDict | None,
+    columns: Sequence[int] | Sequence[str] | None,
+    table_name: str | None = None,
+    drop_empty_rows: bool,
+    drop_empty_cols: bool,
+    raise_if_empty: bool,
+) -> pl.DataFrame:
+    """Use the 'openpyxl' library to read data from the given worksheet."""
+    infer_schema_length = read_options.pop("infer_schema_length", None)
+    has_header = read_options.pop("has_header", True)
+    schema_overrides = schema_overrides or {}
+    no_inference = infer_schema_length == 0
+    header: list[str | None] = []
+
+    if table_name and not sheet_name:
+        sheet_name, n_tables = None, 0
+        for sheet in parser.worksheets:
+            n_tables += 1
+            if table_name in sheet.tables:
+                ws, sheet_name = sheet, sheet.title
+                break
+        if sheet_name is None:
+            msg = (
+                f"table named {table_name!r} not found in sheet {sheet_name!r}"
+                if n_tables
+                else f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
+            )
+            raise RuntimeError(msg)
+    else:
+        ws = parser[sheet_name]
+
+    # prefer detection of actual table objects; otherwise read
+    # data in the used worksheet range, dropping null columns
+    if tables := getattr(ws, "tables", None):
+        table = tables[table_name] if table_name else next(iter(tables.values()))
+        rows = list(ws[table.ref])
+        if not rows:
+            return _empty_frame(raise_if_empty)
+        if has_header:
+            header.extend(cell.value for cell in rows.pop(0))
+        else:
+            header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
+        if table.totalsRowCount:
+            rows = rows[: -table.totalsRowCount]
+        rows_iter = rows
+    elif table_name:
+        msg = f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})"
+        raise RuntimeError(msg)
+    else:
+        if not has_header:
+            if not (rows_iter := list(ws.iter_rows())):
+                return _empty_frame(raise_if_empty)
+            n_cols = len(rows_iter[0])
+            header = [f"column_{n}" for n in range(1, n_cols + 1)]
+        else:
+            rows_iter = ws.iter_rows()
+            for row in rows_iter:
+                row_values = [cell.value for cell in row]
+                if any(v is not None for v in row_values):
+                    header.extend(row_values)
+                    break
+
+    dtype = String if no_inference else None
+    series_data = []
+    for name, column_data in zip(header, zip(*rows_iter)):
+        if name or not drop_empty_cols:
+            values = [cell.value for cell in column_data]
+            if no_inference or (dtype := schema_overrides.get(name)) == String:  # type: ignore[assignment,arg-type]
+                # note: if we initialise the series with mixed-type data (eg: str/int)
+                # then the non-strings will become null, so we handle the cast here
+                values = [str(v) if (v is not None) else v for v in values]
+
+            if (tp := schema_overrides.get(name)) in (Date, Datetime, Time):  # type: ignore[operator,arg-type]
+                s = pl.Series(name, values, strict=False)
+                if s.dtype == String:
+                    if tp == Datetime:
+                        s = s.str.to_datetime(
+                            time_unit=getattr(tp, "time_unit", None),
+                            time_zone=getattr(tp, "time_zone", None),
+                        )
+                    elif tp == Date:
+                        s = s.str.strip_suffix(" 00:00:00").str.to_date()
+                    elif tp == Time:
+                        s = s.str.to_time()
+            else:
+                s = pl.Series(name, values, dtype=dtype, strict=False)
+            series_data.append(s)
+
+    names = deduplicate_names(s.name for s in series_data)
+    df = pl.DataFrame(
+        dict(zip(names, series_data)),
+        schema_overrides=schema_overrides,
+        infer_schema_length=infer_schema_length,
+        strict=False,
+    )
+    df = _drop_null_data(
+        df,
+        raise_if_empty=raise_if_empty,
+        drop_empty_rows=drop_empty_rows,
+        drop_empty_cols=drop_empty_cols,
+    )
+    df = _reorder_columns(df, columns)
+    return df
+
+
 def _read_spreadsheet_xlsx2csv(
     parser: Any,
     *,
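The relocated openpyxl reader takes the eager route instead: it builds each Series first and, if the column materialises as String while the override asks for a temporal dtype, parses it in place, stripping a trailing " 00:00:00" before Date parsing. A small sketch of that conversion step on assumed values:

    import polars as pl

    s = pl.Series("d", ["2025-01-24 00:00:00", None])
    # date cells sometimes surface as text with a midnight time attached;
    # dropping the suffix lets str.to_date parse what remains
    d = s.str.strip_suffix(" 00:00:00").str.to_date()
    print(d.to_list())  # [datetime.date(2025, 1, 24), None]
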
py-polars/polars/series/categorical.py (1 addition & 1 deletion)
@@ -96,7 +96,7 @@ def to_local(self) -> Series:
     @unstable()
     def uses_lexical_ordering(self) -> bool:
         """
-        Return whether or not the series uses lexical ordering.
+        Indicate whether the Series uses lexical ordering.
 
         .. warning::
             This functionality is considered **unstable**. It may be changed
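For context, the retouched docstring belongs to a method that reports how a Categorical Series orders its values. A brief usage sketch (my reading of the API; the method is marked unstable):

    import polars as pl

    s = pl.Series(["b", "a"], dtype=pl.Categorical)
    print(s.cat.uses_lexical_ordering())  # False: ordered by physical encoding

    s = pl.Series(["b", "a"], dtype=pl.Categorical("lexical"))
    print(s.cat.uses_lexical_ordering())  # True: compares like plain strings
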
py-polars/polars/series/string.py (2 additions & 2 deletions)
@@ -2060,9 +2060,9 @@ def find_many(
         overlapping: bool = False,
     ) -> Series:
         """
-        Use the Aho-Corasick algorithm to find many matches.
+        Use the Aho-Corasick algorithm to find all matches.
 
-        The function will return the bytes offset of the start of each match.
+        The function returns the byte offset of the start of each match.
         The return type will be `List<UInt32>`
 
         Parameters
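And the corrected find_many wording in action; a minimal sketch of the byte offsets it returns, with illustrative values:

    import polars as pl

    s = pl.Series("text", ["the lazy dog"])
    # one inner list per row, holding the byte offset of each match start
    print(s.str.find_many(["la", "do"]))  # expected: [[4, 9]]
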
Binary file modified: py-polars/tests/unit/io/files/example.xls
Binary file modified: py-polars/tests/unit/io/files/example.xlsb
Binary file modified: py-polars/tests/unit/io/files/example.xlsx