Fix reading CSV in commands (#470)
fuglaeff authored Sep 5, 2024
1 parent 72ab4dd commit 1526e36
Showing 6 changed files with 150 additions and 6 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -54,7 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix `IForestOutlierTransform` failed with ignored `target` column ([#460](https://github.com/etna-team/etna/pull/460))
- Add lower limit for `typing_extension` versions ([#458](https://github.com/etna-team/etna/pull/458))
- Fix `ModelDecomposeTransform` import without `prophet` module ([#459](https://github.com/etna-team/etna/pull/459))
-
- Convert `segment` to string during reading csv in `backtest` and `forecast` commands ([#470](https://github.com/etna-team/etna/pull/470))
-
-
- Fix holidays during loading datasets `traffic_2008_10T` and `traffic_2008_hourly` ([#462](https://github.com/etna-team/etna/pull/462))
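The conversion mentioned in the new changelog entry boils down to a `dtype={"segment": str}` hint passed to `pd.read_csv` (shown in the command diffs below). A minimal sketch of the behaviour it guards against, using illustrative data only (not taken from the repository):

import io

import pandas as pd

csv = "timestamp,segment,target\n2021-06-01,01234,1\n2021-06-01,12345,2\n"

# Without an explicit dtype pandas infers `segment` as int64, so "01234" becomes 1234.
inferred = pd.read_csv(io.StringIO(csv), parse_dates=["timestamp"])
print(inferred["segment"].tolist())  # [1234, 12345]

# Forcing the column to str keeps the segment names exactly as written in the CSV.
as_str = pd.read_csv(io.StringIO(csv), parse_dates=["timestamp"], dtype={"segment": str})
print(as_str["segment"].tolist())  # ['01234', '12345']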
4 changes: 2 additions & 2 deletions etna/commands/backtest_command.py
@@ -82,13 +82,13 @@ def backtest(
freq_init = freq
parse_dates = ["timestamp"]

- df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates)
+ df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates, dtype={"segment": str})
df_timeseries = TSDataset.to_dataset(df_timeseries)

df_exog = None
k_f: Union[Literal["all"], Sequence[Any]] = ()
if exog_path:
- df_exog = pd.read_csv(exog_path, parse_dates=parse_dates)
+ df_exog = pd.read_csv(exog_path, parse_dates=parse_dates, dtype={"segment": str})
df_exog = TSDataset.to_dataset(df_exog)
k_f = "all" if not known_future else known_future

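For context, `TSDataset.to_dataset` reshapes the long-format frame so that segment names become column labels in the wide format. The sketch below is a rough pure-pandas approximation of that idea (not etna's actual implementation), showing why the names read from the CSV need to be strings already: whatever dtype they arrive with is what the wide columns, and therefore the forecast output, will carry.

import pandas as pd

df = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(["2021-06-01", "2021-06-01"]),
        "segment": ["01234", "12345"],  # already strings thanks to dtype={"segment": str}
        "target": [1.0, 2.0],
    }
)
# Rough long-to-wide reshape: one column per segment.
wide = df.pivot(index="timestamp", columns="segment", values="target")
print(list(wide.columns))  # ['01234', '12345'] -- segment names survive unchanged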
5 changes: 2 additions & 3 deletions etna/commands/forecast_command.py
@@ -129,14 +129,13 @@ def forecast(
freq_init = freq
parse_dates = ["timestamp"]

- df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates)
-
+ df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates, dtype={"segment": str})
df_timeseries = TSDataset.to_dataset(df_timeseries)

df_exog = None
k_f: Union[Literal["all"], Sequence[Any]] = ()
if exog_path:
- df_exog = pd.read_csv(exog_path, parse_dates=parse_dates)
+ df_exog = pd.read_csv(exog_path, parse_dates=parse_dates, dtype={"segment": str})
df_exog = TSDataset.to_dataset(df_exog)
k_f = "all" if not known_future else known_future

35 changes: 35 additions & 0 deletions tests/test_commands/conftest.py
@@ -232,3 +232,38 @@ def small_ts():
    df = pd.DataFrame({"segment": ["segment_0"], "timestamp": [pd.Timestamp("2020-01-01")], "target": [1]})
    df = TSDataset.to_dataset(df=df)
    return TSDataset(df=df, freq="D")


@pytest.fixture
def base_timeseries_numeric_segments_path():
    df = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=100)) * 2,
            "target": np.arange(200),
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 100 + ["12345"] * 100,
        }
    )
    tmp = NamedTemporaryFile("w")
    df.to_csv(tmp, index=False)
    tmp.flush()
    yield Path(tmp.name)
    tmp.close()


@pytest.fixture
def base_timeseries_numeric_segments_exog_path():
    df_regressors = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=120)) * 2,
            "regressor_1": np.arange(240),
            "regressor_2": np.arange(240) + 5,
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 120 + ["12345"] * 120,
        }
    )
    tmp = NamedTemporaryFile("w")
    df_regressors.to_csv(tmp, index=False)
    tmp.flush()
    yield Path(tmp.name)
    tmp.close()
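Both fixtures follow the same pattern: write a CSV with numeric-looking segment names to a `NamedTemporaryFile`, call `flush()` so the data is on disk before the test reads it, yield the path, and close (and thereby delete) the file during teardown. A hypothetical consumer of the fixture (not part of this commit) would look like:

import pandas as pd

def test_fixture_keeps_leading_zeros(base_timeseries_numeric_segments_path):
    # Hypothetical check: the segment names written by the fixture come back verbatim
    # when the CSV is read with an explicit string dtype.
    df = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    assert set(df["segment"].unique()) == {"01234", "12345"}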
54 changes: 54 additions & 0 deletions tests/test_commands/test_backtest.py
@@ -180,3 +180,57 @@ def test_backtest_estimate_n_folds(
    )
    forecast_df = pd.read_csv(tmp_output_path / "forecast.csv")
    assert forecast_df["fold_number"].nunique() == expected


def test_backtest_with_numeric_segments(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = TemporaryDirectory()
    tmp_output_path = Path(tmp_output.name)
    run(
        [
            "etna",
            "backtest",
            str(base_pipeline_yaml_path),
            str(base_backtest_yaml_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
        ]
    )
    df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})
    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)


def test_backtest_with_numeric_segments_with_exog(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = TemporaryDirectory()
    tmp_output_path = Path(tmp_output.name)
    run(
        [
            "etna",
            "backtest",
            str(base_pipeline_yaml_path),
            str(base_backtest_yaml_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
            str(base_timeseries_numeric_segments_exog_path),
        ]
    )
    df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})
    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)
56 changes: 56 additions & 0 deletions tests/test_commands/test_forecast.py
@@ -409,3 +409,59 @@ def test_forecast_with_estimate_n_folds(

    assert all(x in df_output.columns for x in ["target_0.025", "target_0.975"])
    assert len(df_output) == 4 * 2  # 4 predictions for 2 segments


def test_forecast_with_numeric_segments(
    base_pipeline_yaml_path,
    base_timeseries_numeric_segments_path,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = NamedTemporaryFile("w")
    tmp_output_path = Path(tmp_output.name)
    run(
        [
            "etna",
            "forecast",
            str(base_pipeline_yaml_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
        ],
    )
    df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})
    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)


@pytest.mark.parametrize(
    "pipeline_path_name",
    ("base_pipeline_yaml_path", "base_ensemble_yaml_path"),
)
def test_forecast_with_numeric_segments_with_exog(
    pipeline_path_name,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
    request,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = NamedTemporaryFile("w")
    tmp_output_path = Path(tmp_output.name)
    pipeline_path = request.getfixturevalue(pipeline_path_name)
    run(
        [
            "etna",
            "forecast",
            str(pipeline_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
            str(base_timeseries_numeric_segments_exog_path),
        ],
    )
    df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})
    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)
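The parametrized exog test above resolves pipelines by fixture name via `request.getfixturevalue`, a standard pytest idiom for running one test body against several fixtures. A standalone sketch of the pattern, with hypothetical fixture names unrelated to the repository:

import pytest

@pytest.fixture
def short_series_path():
    return "short.csv"

@pytest.fixture
def long_series_path():
    return "long.csv"

@pytest.mark.parametrize("path_fixture_name", ["short_series_path", "long_series_path"])
def test_resolves_fixture_by_name(path_fixture_name, request):
    # `request.getfixturevalue` looks the fixture up at runtime by its name,
    # so a single test can be parametrized over several fixtures.
    path = request.getfixturevalue(path_fixture_name)
    assert path.endswith(".csv")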
