Fix reading CSV in commands (#470)
fuglaeff authored Sep 5, 2024
1 parent 72ab4dd commit 1526e36
Showing 6 changed files with 150 additions and 6 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -54,7 +54,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix `IForestOutlierTransform` failed with ignored `target` column ([#460](https://github.com/etna-team/etna/pull/460))
- Add lower limit for `typing_extension` versions ([#458](https://github.com/etna-team/etna/pull/458))
- Fix `ModelDecomposeTransform` import without `prophet` module ([#459](https://github.com/etna-team/etna/pull/459))
-
- Convert `segment` to string during reading csv in `backtest` and `forecast` commands ([#470](https://github.com/etna-team/etna/pull/470))
-
-
- Fix holidays during loading datasets `traffic_2008_10T` and `traffic_2008_hourly` ([#462](https://github.com/etna-team/etna/pull/462))
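The conversion mentioned in the new changelog entry boils down to a `dtype={"segment": str}` hint passed to `pd.read_csv` (shown in the command diffs below). A minimal sketch of the behaviour it guards against, using illustrative data only (not taken from the repository):

import io

import pandas as pd

csv = "timestamp,segment,target\n2021-06-01,01234,1\n2021-06-01,12345,2\n"

# Without an explicit dtype pandas infers `segment` as int64, so "01234" becomes 1234.
inferred = pd.read_csv(io.StringIO(csv), parse_dates=["timestamp"])
print(inferred["segment"].tolist())  # [1234, 12345]

# Forcing the column to str keeps the segment names exactly as written in the CSV.
as_str = pd.read_csv(io.StringIO(csv), parse_dates=["timestamp"], dtype={"segment": str})
print(as_str["segment"].tolist())  # ['01234', '12345']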
4 changes: 2 additions & 2 deletions etna/commands/backtest_command.py
@@ -82,13 +82,13 @@ def backtest(
freq_init = freq
parse_dates = ["timestamp"]

- df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates)
+ df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates, dtype={"segment": str})
df_timeseries = TSDataset.to_dataset(df_timeseries)

df_exog = None
k_f: Union[Literal["all"], Sequence[Any]] = ()
if exog_path:
- df_exog = pd.read_csv(exog_path, parse_dates=parse_dates)
+ df_exog = pd.read_csv(exog_path, parse_dates=parse_dates, dtype={"segment": str})
df_exog = TSDataset.to_dataset(df_exog)
k_f = "all" if not known_future else known_future

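For context, `TSDataset.to_dataset` reshapes the long-format frame so that segment names become column labels in the wide format. The sketch below is a rough pure-pandas approximation of that idea (not etna's actual implementation), showing why the names read from the CSV need to be strings already: whatever dtype they arrive with is what the wide columns, and therefore the forecast output, will carry.

import pandas as pd

df = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(["2021-06-01", "2021-06-01"]),
        "segment": ["01234", "12345"],  # already strings thanks to dtype={"segment": str}
        "target": [1.0, 2.0],
    }
)
# Rough long-to-wide reshape: one column per segment.
wide = df.pivot(index="timestamp", columns="segment", values="target")
print(list(wide.columns))  # ['01234', '12345'] -- segment names survive unchanged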
5 changes: 2 additions & 3 deletions etna/commands/forecast_command.py
@@ -129,14 +129,13 @@ def forecast(
freq_init = freq
parse_dates = ["timestamp"]

- df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates)
-
+ df_timeseries = pd.read_csv(target_path, parse_dates=parse_dates, dtype={"segment": str})
df_timeseries = TSDataset.to_dataset(df_timeseries)

df_exog = None
k_f: Union[Literal["all"], Sequence[Any]] = ()
if exog_path:
- df_exog = pd.read_csv(exog_path, parse_dates=parse_dates)
+ df_exog = pd.read_csv(exog_path, parse_dates=parse_dates, dtype={"segment": str})
df_exog = TSDataset.to_dataset(df_exog)
k_f = "all" if not known_future else known_future

35 changes: 35 additions & 0 deletions tests/test_commands/conftest.py
@@ -232,3 +232,38 @@ def small_ts():
    df = pd.DataFrame({"segment": ["segment_0"], "timestamp": [pd.Timestamp("2020-01-01")], "target": [1]})
    df = TSDataset.to_dataset(df=df)
    return TSDataset(df=df, freq="D")


@pytest.fixture
def base_timeseries_numeric_segments_path():
    df = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=100)) * 2,
            "target": np.arange(200),
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 100 + ["12345"] * 100,
        }
    )
    tmp = NamedTemporaryFile("w")
    df.to_csv(tmp, index=False)
    tmp.flush()
    yield Path(tmp.name)
    tmp.close()


@pytest.fixture
def base_timeseries_numeric_segments_exog_path():
    df_regressors = pd.DataFrame(
        {
            "timestamp": list(pd.date_range("2021-06-01", periods=120)) * 2,
            "regressor_1": np.arange(240),
            "regressor_2": np.arange(240) + 5,
            # segments with numeric names and leading zeros
            "segment": ["01234"] * 120 + ["12345"] * 120,
        }
    )
    tmp = NamedTemporaryFile("w")
    df_regressors.to_csv(tmp, index=False)
    tmp.flush()
    yield Path(tmp.name)
    tmp.close()
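Both fixtures follow the same pattern: write a CSV with numeric-looking segment names to a `NamedTemporaryFile`, call `flush()` so the data is on disk before the test reads it, yield the path, and close (and thereby delete) the file during teardown. A hypothetical consumer of the fixture (not part of this commit) would look like:

import pandas as pd

def test_fixture_keeps_leading_zeros(base_timeseries_numeric_segments_path):
    # Hypothetical check: the segment names written by the fixture come back verbatim
    # when the CSV is read with an explicit string dtype.
    df = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    assert set(df["segment"].unique()) == {"01234", "12345"}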
54 changes: 54 additions & 0 deletions tests/test_commands/test_backtest.py
@@ -180,3 +180,57 @@ def test_backtest_estimate_n_folds(
    )
    forecast_df = pd.read_csv(tmp_output_path / "forecast.csv")
    assert forecast_df["fold_number"].nunique() == expected


def test_backtest_with_numeric_segments(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = TemporaryDirectory()
    tmp_output_path = Path(tmp_output.name)
    run(
        [
            "etna",
            "backtest",
            str(base_pipeline_yaml_path),
            str(base_backtest_yaml_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
        ]
    )
    df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})
    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)


def test_backtest_with_numeric_segments_with_exog(
    base_pipeline_yaml_path,
    base_backtest_yaml_path,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = TemporaryDirectory()
    tmp_output_path = Path(tmp_output.name)
    run(
        [
            "etna",
            "backtest",
            str(base_pipeline_yaml_path),
            str(base_backtest_yaml_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
            str(base_timeseries_numeric_segments_exog_path),
        ]
    )
    df_forecast = pd.read_csv(tmp_output_path / "forecast.csv", dtype={"segment": str})
    output_segments = df_forecast["segment"].unique()
    assert set(segments) == set(output_segments)
56 changes: 56 additions & 0 deletions tests/test_commands/test_forecast.py
@@ -409,3 +409,59 @@ def test_forecast_with_estimate_n_folds(

    assert all(x in df_output.columns for x in ["target_0.025", "target_0.975"])
    assert len(df_output) == 4 * 2  # 4 predictions for 2 segments


def test_forecast_with_numeric_segments(
    base_pipeline_yaml_path,
    base_timeseries_numeric_segments_path,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = NamedTemporaryFile("w")
    tmp_output_path = Path(tmp_output.name)
    run(
        [
            "etna",
            "forecast",
            str(base_pipeline_yaml_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
        ],
    )
    df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})
    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)


@pytest.mark.parametrize(
    "pipeline_path_name",
    ("base_pipeline_yaml_path", "base_ensemble_yaml_path"),
)
def test_forecast_with_numeric_segments_with_exog(
    pipeline_path_name,
    base_timeseries_numeric_segments_path,
    base_timeseries_numeric_segments_exog_path,
    request,
):
    target = pd.read_csv(base_timeseries_numeric_segments_path, dtype={"segment": str})
    segments = target["segment"].unique()

    tmp_output = NamedTemporaryFile("w")
    tmp_output_path = Path(tmp_output.name)
    pipeline_path = request.getfixturevalue(pipeline_path_name)
    run(
        [
            "etna",
            "forecast",
            str(pipeline_path),
            str(base_timeseries_numeric_segments_path),
            "D",
            str(tmp_output_path),
            str(base_timeseries_numeric_segments_exog_path),
        ],
    )
    df_output = pd.read_csv(tmp_output_path, dtype={"segment": str})
    output_segments = df_output["segment"].unique()
    assert set(segments) == set(output_segments)
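The parametrized exog test above resolves pipelines by fixture name via `request.getfixturevalue`, a standard pytest idiom for running one test body against several fixtures. A standalone sketch of the pattern, with hypothetical fixture names unrelated to the repository:

import pytest

@pytest.fixture
def short_series_path():
    return "short.csv"

@pytest.fixture
def long_series_path():
    return "long.csv"

@pytest.mark.parametrize("path_fixture_name", ["short_series_path", "long_series_path"])
def test_resolves_fixture_by_name(path_fixture_name, request):
    # `request.getfixturevalue` looks the fixture up at runtime by its name,
    # so a single test can be parametrized over several fixtures.
    path = request.getfixturevalue(path_fixture_name)
    assert path.endswith(".csv")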
