diff --git a/harvest-planner/.dockerignore b/harvest-planner/.dockerignore
new file mode 100644
index 0000000..ead0e6f
--- /dev/null
+++ b/harvest-planner/.dockerignore
@@ -0,0 +1,7 @@
+.dockerignore
+.git
+.idea
+.ruff_cache
+.venv
+objects.json
+ruff.toml
diff --git a/harvest-planner/.gitignore b/harvest-planner/.gitignore
new file mode 100644
index 0000000..ae8554d
--- /dev/null
+++ b/harvest-planner/.gitignore
@@ -0,0 +1,10 @@
+# python generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# venv
+.venv
diff --git a/harvest-planner/.python-version b/harvest-planner/.python-version
new file mode 100644
index 0000000..530fe91
--- /dev/null
+++ b/harvest-planner/.python-version
@@ -0,0 +1 @@
+3.11.9
\ No newline at end of file
diff --git a/harvest-planner/Dockerfile b/harvest-planner/Dockerfile
new file mode 100644
index 0000000..dbeea0a
--- /dev/null
+++ b/harvest-planner/Dockerfile
@@ -0,0 +1,24 @@
+FROM python:3.11 AS build-env
+
+ARG version=3.11
+ARG APP=app
+
+COPY . ./$APP
+
+WORKDIR /$APP
+
+COPY requirements.lock /$APP/
+RUN sed '/-e/d' requirements.lock > requirements.txt
+RUN --mount=type=cache,target=/root/.cache \
+    pip install -r ./requirements.txt
+
+FROM gcr.io/distroless/python3-debian12:debug
+ARG version=3.11
+ARG APP=app
+
+COPY --from=build-env /$APP /$APP
+COPY --from=build-env /usr/local/lib/python${version}/site-packages /usr/local/lib/python${version}/site-packages
+
+WORKDIR /$APP
+ENV PYTHONPATH=/usr/local/lib/python${version}/site-packages
+ENTRYPOINT [ "python", "src/harvest_planner/main.py" ]
\ No newline at end of file
diff --git a/harvest-planner/README.md b/harvest-planner/README.md
new file mode 100644
index 0000000..b2a22ef
--- /dev/null
+++ b/harvest-planner/README.md
@@ -0,0 +1,30 @@
+# Harvest Planner
+
+Harvest Planner estimates how much memory each poller needs to monitor ONTAP and StorageGRID clusters.
+Here's how to use it:
+
+1. Run the following Harvest command to gather object counts from your cluster(s):
+   `bin/harvest planner -p poller`  # one cluster
+   `bin/harvest planner`            # multiple clusters
+   `bin/harvest planner --docker`   # multiple clusters, and runs the following Docker command for you
+
+The planner command will create an `objects.json` file that contains the object counts for each cluster.
+
+2. Run the following Docker command to estimate how much memory each poller needs to monitor its cluster.
+
+```bash
+docker run --rm \
+  --volume "$(pwd)/objects.json:/objects.json" \
+  ghcr.io/netapp/harvest-planner \
+  estimate-memory -i /objects.json
+```
+
+# Development
+
+Harvest-planner is written in Python and uses [Rye](https://rye.astral.sh/) for development.
+
+To get started, install Rye, clone the repo, cd into `harvest-metrics/harvest-planner`, and run the following command:
+
+```bash
+rye sync
+```
\ No newline at end of file
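If you'd rather skip Docker during development, the estimator can also be run straight from a Rye-synced checkout. This is a minimal sketch, not a documented workflow: it assumes `rye sync` has completed and that you run it from the `harvest-planner` directory, since `main.py` loads the bundled model via the relative `models/` paths.

```bash
# run the estimator against a locally generated objects.json
# (run from harvest-planner/ so models/gbr_model.pkl resolves)
rye run python src/harvest_planner/main.py estimate-memory -i objects.json
```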
diff --git a/harvest-planner/models/gbr_model.pkl b/harvest-planner/models/gbr_model.pkl
new file mode 100644
index 0000000..2edb2e1
Binary files /dev/null and b/harvest-planner/models/gbr_model.pkl differ
diff --git a/harvest-planner/models/gbr_scaler.pkl b/harvest-planner/models/gbr_scaler.pkl
new file mode 100644
index 0000000..f2f7307
Binary files /dev/null and b/harvest-planner/models/gbr_scaler.pkl differ
diff --git a/harvest-planner/objects.json.example b/harvest-planner/objects.json.example
new file mode 100644
index 0000000..c7b1c34
--- /dev/null
+++ b/harvest-planner/objects.json.example
@@ -0,0 +1,116 @@
+[
+  {
+    "DiskConfig": 2040,
+    "DiskPerf": 6400,
+    "LunConfig": 0,
+    "LunPerf": 0,
+    "NFSClientsConfig": 0,
+    "Poller": "cluster-01",
+    "QtreeConfig": 1847,
+    "QtreePerf": 0,
+    "SVMConfig": 24,
+    "SensorConfig": 2400,
+    "SnapMirrorConfig": 0,
+    "SnapshotConfig": 180,
+    "StorageGridSG": 0,
+    "VolumeAnalyticsConfig": 0,
+    "VolumeConfig": 30492,
+    "VolumePerf": 9242,
+    "WorkloadDetailVolumePerf": 91296
+  },
+  {
+    "DiskConfig": 240,
+    "DiskPerf": 640,
+    "LunConfig": 0,
+    "LunPerf": 0,
+    "NFSClientsConfig": 0,
+    "Poller": "cluster-02",
+    "QtreeConfig": 0,
+    "QtreePerf": 0,
+    "SVMConfig": 24,
+    "SensorConfig": 0,
+    "SnapMirrorConfig": 0,
+    "SnapshotConfig": 0,
+    "StorageGridSG": 0,
+    "VolumeAnalyticsConfig": 1735,
+    "VolumeConfig": 30492,
+    "VolumePerf": 9242,
+    "WorkloadDetailVolumePerf": 91296
+  },
+  {
+    "DiskConfig": 23,
+    "DiskPerf": 64,
+    "LunConfig": 15,
+    "LunPerf": 6,
+    "NFSClientsConfig": 0,
+    "Poller": "sar",
+    "QtreeConfig": 935,
+    "QtreePerf": 938,
+    "SVMConfig": 87,
+    "SensorConfig": 186,
+    "SnapMirrorConfig": 684,
+    "SnapshotConfig": 1703,
+    "StorageGridSG": 0,
+    "VolumeAnalyticsConfig": 4,
+    "VolumeConfig": 940,
+    "VolumePerf": 922,
+    "WorkloadDetailVolumePerf": 0
+  },
+  {
+    "DiskConfig": 48,
+    "DiskPerf": 63,
+    "LunConfig": 15,
+    "LunPerf": 6,
+    "NFSClientsConfig": 0,
+    "Poller": "F2240-127-26",
+    "QtreeConfig": 862,
+    "QtreePerf": 17,
+    "SVMConfig": 273,
+    "SensorConfig": 0,
+    "SnapMirrorConfig": 89,
+    "SnapshotConfig": 0,
+    "StorageGridSG": 0,
+    "VolumeAnalyticsConfig": 0,
+    "VolumeConfig": 918,
+    "VolumePerf": 908,
+    "WorkloadDetailVolumePerf": 0
+  },
+  {
+    "DiskConfig": 24,
+    "DiskPerf": 66,
+    "LunConfig": 133,
+    "LunPerf": 133,
+    "NFSClientsConfig": 0,
+    "Poller": "nikhita",
+    "QtreeConfig": 0,
+    "QtreePerf": 86,
+    "SVMConfig": 12,
+    "SensorConfig": 242,
+    "SnapMirrorConfig": 1,
+    "SnapshotConfig": 2433,
+    "StorageGridSG": 0,
+    "VolumeAnalyticsConfig": 0,
+    "VolumeConfig": 90,
+    "VolumePerf": 89,
+    "WorkloadDetailVolumePerf": 73
+  },
+  {
+    "DiskConfig": 792,
+    "DiskPerf": 1144,
+    "LunConfig": 0,
+    "LunPerf": 0,
+    "NFSClientsConfig": 0,
+    "Poller": "nasclu01",
+    "QtreeConfig": 325,
+    "QtreePerf": 4228,
+    "SVMConfig": 92,
+    "SensorConfig": 1116,
+    "SnapMirrorConfig": 283,
+    "SnapshotConfig": 0,
+    "StorageGridSG": 0,
+    "VolumeAnalyticsConfig": 1,
+    "VolumeConfig": 4028,
+    "VolumePerf": 4020,
+    "WorkloadDetailVolumePerf": 0
+  }
+]
\ No newline at end of file
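Each object in this JSON array becomes one DataFrame row when `main.py` loads the file with `pandas.read_json`, and each count key maps onto a feature column. A short sketch of that loading step, assuming it runs next to the example file:

```python
import pandas as pd

# each object in the JSON array becomes one DataFrame row
df = pd.read_json("objects.json.example")

# the model consumes only the count columns; "Poller" is kept for display
features = df.drop(columns=["Poller"])
print(df["Poller"].tolist())  # ['cluster-01', 'cluster-02', 'sar', ...]
print(features.shape)         # (6, 16) -- 6 pollers x 16 object counts
```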
StorageGRID clusters" +dependencies = [ + "joblib>=1.4.2", + "numpy>=2.0.0", + "pandas>=2.2.2", + "scikit-learn>=1.5.0", + "scipy>=1.13.1", +] +readme = "README.md" +requires-python = ">= 3.8" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.rye] +managed = true +dev-dependencies = [] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/harvest_planner"] diff --git a/harvest-planner/requirements-dev.lock b/harvest-planner/requirements-dev.lock new file mode 100644 index 0000000..312c7a9 --- /dev/null +++ b/harvest-planner/requirements-dev.lock @@ -0,0 +1,36 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false + +-e file:. +joblib==1.4.2 + # via harvest-planner + # via scikit-learn +numpy==2.0.0 + # via harvest-planner + # via pandas + # via scikit-learn + # via scipy +pandas==2.2.2 + # via harvest-planner +python-dateutil==2.9.0.post0 + # via pandas +pytz==2024.1 + # via pandas +scikit-learn==1.5.0 + # via harvest-planner +scipy==1.14.0 + # via harvest-planner + # via scikit-learn +six==1.16.0 + # via python-dateutil +threadpoolctl==3.5.0 + # via scikit-learn +tzdata==2024.1 + # via pandas diff --git a/harvest-planner/requirements.lock b/harvest-planner/requirements.lock new file mode 100644 index 0000000..312c7a9 --- /dev/null +++ b/harvest-planner/requirements.lock @@ -0,0 +1,36 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false + +-e file:. 
diff --git a/harvest-planner/requirements.lock b/harvest-planner/requirements.lock
new file mode 100644
index 0000000..312c7a9
--- /dev/null
+++ b/harvest-planner/requirements.lock
@@ -0,0 +1,36 @@
+# generated by rye
+# use `rye lock` or `rye sync` to update this lockfile
+#
+# last locked with the following flags:
+#   pre: false
+#   features: []
+#   all-features: false
+#   with-sources: false
+#   generate-hashes: false
+
+-e file:.
+joblib==1.4.2
+    # via harvest-planner
+    # via scikit-learn
+numpy==2.0.0
+    # via harvest-planner
+    # via pandas
+    # via scikit-learn
+    # via scipy
+pandas==2.2.2
+    # via harvest-planner
+python-dateutil==2.9.0.post0
+    # via pandas
+pytz==2024.1
+    # via pandas
+scikit-learn==1.5.0
+    # via harvest-planner
+scipy==1.14.0
+    # via harvest-planner
+    # via scikit-learn
+six==1.16.0
+    # via python-dateutil
+threadpoolctl==3.5.0
+    # via scikit-learn
+tzdata==2024.1
+    # via pandas
diff --git a/harvest-planner/ruff.toml b/harvest-planner/ruff.toml
new file mode 100644
index 0000000..a2d0db7
--- /dev/null
+++ b/harvest-planner/ruff.toml
@@ -0,0 +1,26 @@
+[lint]
+select = [
+    "ARG",
+    "B",
+    "E",
+    "ERA",
+    "F",
+    "I",
+    "ISC",
+    "NPY",
+    "PD",
+    "PERF",
+    "PIE",
+    "PL",
+    "PTH",
+    "PYI",
+    "Q",
+    "RET",
+    "RUF",
+    "S",
+    "SIM",
+    "UP",
+    "YTT",
+]
+
+ignore = ["ISC001"]
\ No newline at end of file
diff --git a/harvest-planner/src/harvest_planner/__init__.py b/harvest-planner/src/harvest_planner/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/harvest-planner/src/harvest_planner/main.py b/harvest-planner/src/harvest_planner/main.py
new file mode 100644
index 0000000..344e0a1
--- /dev/null
+++ b/harvest-planner/src/harvest_planner/main.py
@@ -0,0 +1,254 @@
+import argparse
+import pathlib
+
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+# Constants
+MODEL_PATH = "models/gbr_model.pkl"
+SCALER_PATH = "models/gbr_scaler.pkl"
+
+BYTES_PER_MB = 1024 * 1024
+POLLER = "Poller"
+PLUS_SOME = 1.2  # 20% more memory than predicted
+PREDICTED_MB = "EstimatedMB"
+PREDICTED_RSS = "PredictedRss"
+RSS_BYTES = "RssBytes"
+RSS_MB = "RssMB"
+HARVESTED_FEATURES = [
+    "DiskConfig",
+    "DiskPerf",
+    "LunConfig",
+    "LunPerf",
+    "NFSClientsConfig",
+    "QtreeConfig",
+    "QtreePerf",
+    "SVMConfig",
+    "SensorConfig",
+    "SnapMirrorConfig",
+    "SnapshotConfig",
+    "StorageGridSG",
+    "VolumeAnalyticsConfig",
+    "VolumeConfig",
+    "VolumePerf",
+    "WorkloadDetailVolumePerf",
+]
+
+
+def train_model(args):
+    # check that the input file exists
+    if not args.input.exists():
+        print(f'Error: The input file "{args.input}" does not exist.')
+        return
+
+    # Load the CSV file
+    try:
+        data = pd.read_csv(args.input)
+    except Exception as e:
+        print(f'Error: Failed to read the input file "{args.input}". {e}')
+        return
+
+    # Prepare the data using the selected features
+    x_selected = data[HARVESTED_FEATURES]
+    y = data[RSS_BYTES]
+
+    # Split the data into training and testing sets
+    x_train, x_test, y_train, y_test = train_test_split(
+        x_selected, y, test_size=0.2, random_state=42
+    )
+
+    # Standardize the features
+    scaler = StandardScaler()
+    x_train_scaled = scaler.fit_transform(x_train)
+    x_test_scaled = scaler.transform(x_test)
+    x_selected_scaled = scaler.transform(x_selected)
+
+    # Set common parameters for GradientBoostingRegressor
+    params = {
+        "n_estimators": 100,
+        "learning_rate": 0.1,
+        "max_depth": 6,
+        "min_samples_split": 2,
+        "min_samples_leaf": 1,
+        "subsample": 0.9,
+        "max_features": 0.9,
+        "random_state": 42,
+    }
+
+    # Train the model with the common parameters
+    gbr = GradientBoostingRegressor(**params)
+    gbr.fit(x_train_scaled, y_train)
+
+    # Save the model and scaler to disk
+    model_file_path = MODEL_PATH
+    scaler_file_path = SCALER_PATH
+    joblib.dump(gbr, model_file_path)
+    joblib.dump(scaler, scaler_file_path)
+
+    # Predict using the trained GradientBoostingRegressor model
+    data[PREDICTED_RSS] = gbr.predict(x_selected_scaled)
+    y_train_pred = gbr.predict(x_train_scaled)
+    y_test_pred = gbr.predict(x_test_scaled)
+
+    # Create new columns RssMB and PredictedMB
+    data[RSS_MB] = data[RSS_BYTES] / BYTES_PER_MB
+    data[PREDICTED_MB] = data[PREDICTED_RSS] / BYTES_PER_MB
+
+    # Save the updated DataFrame to a new CSV file
+    if args.save:
+        output_file_path = args.save
+        data.to_csv(output_file_path, index=False)
+
+    # Evaluate the model performance on training data
+    r2_train = r2_score(y_train, y_train_pred)
+    mae_train = mean_absolute_error(y_train, y_train_pred)
+    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
+
+    # Evaluate the model performance on test data
+    r2_test = r2_score(y_test, y_test_pred)
+    mae_test = mean_absolute_error(y_test, y_test_pred)
+    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
+
+    # Display the first few rows of the updated DataFrame to verify the results
+    print(data[[RSS_BYTES, RSS_MB, PREDICTED_RSS, PREDICTED_MB]].head())
+
+    # Create a DataFrame to store evaluation metrics
+    metrics_df = pd.DataFrame(
+        {
+            "Dataset": ["Training", "Test"],
+            "R^2": [r2_train, r2_test],
+            "MAE": [mae_train, mae_test],
+            "RMSE": [rmse_train, rmse_test],
+        }
+    )
+
+    print("\nModel evaluation metrics:")
+    # Print the DataFrame
+    print(metrics_df)
+
+
+# Validate input
+def validate_input(df):
+    if POLLER not in df.columns:
+        print('Error: The DataFrame does not contain a "Poller" column.')
+        return False
+
+    is_valid = True
+    nan_indices = np.where(pd.isna(df))
+
+    if len(nan_indices[0]) == 0:
+        return True
+
+    for row, col in zip(*nan_indices):
+        poller_value = df.loc[row, POLLER]
+        poller_name = "unnamed" if pd.isna(poller_value) else poller_value
+        print(f'Poller "{poller_name}" is missing the required key: {df.columns[col]}')
+        is_valid = False
+
+    return is_valid
+
+
+def predict_size(args):
+    # check that the input file exists
+    if not args.input.exists():
+        print(f'Error: The input file "{args.input}" does not exist.')
+        return
+
+    # Load the input JSON file
+    try:
+        input_data = pd.read_json(args.input)
+    except Exception as e:
+        print(f'Error: Failed to read the input file "{args.input}". {e}')
+        return
+
+    is_valid = validate_input(input_data)
+    if not is_valid:
+        return
+
+    # Load the model and scaler
+    model_file_path = MODEL_PATH
+    scaler_file_path = SCALER_PATH
+    gbr = joblib.load(model_file_path)
+    scaler = joblib.load(scaler_file_path)
+
+    # Prepare the input data using the selected features
+    x_input = input_data[HARVESTED_FEATURES]
+    x_input_scaled = scaler.transform(x_input)
+
+    # Predict the memory size and add 20% (PLUS_SOME) more memory
+    input_data[PREDICTED_RSS] = gbr.predict(x_input_scaled)
+    input_data[PREDICTED_MB] = input_data[PREDICTED_RSS] * PLUS_SOME / BYTES_PER_MB
+
+    # Round the predicted memory size to the nearest integer and print with no decimals
+    input_data[PREDICTED_MB] = input_data[PREDICTED_MB].round(0).astype(int)
+
+    # Calculate the total predicted memory size
+    total_predicted_mb = input_data[PREDICTED_MB].sum()
+
+    # Add a summary row to the DataFrame
+    summary_row = pd.DataFrame({
+        POLLER: "Total",
+        PREDICTED_MB: [total_predicted_mb],
+    })
+    input_data = pd.concat([input_data, summary_row], ignore_index=True)
+
+    # Left justify the poller column
+    input_data[POLLER] = input_data[POLLER].apply(lambda x: f"{x:<}")
+
+    # Display the input data with the predicted memory size
+    print(input_data[[POLLER, PREDICTED_MB]].to_string(index=False))
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Planner")
+    sub_parsers = parser.add_subparsers(dest="command", help="Available commands")
+
+    # Create a parser for the train command
+    train_parser = sub_parsers.add_parser("train", help="Train the model")
+    train_parser.add_argument(
+        "-i",
+        "--input",
+        type=pathlib.Path,
+        required=True,
+        help="CSV file with the training data",
+    )
+    train_parser.add_argument(
+        "-s",
+        "--save",
+        type=pathlib.Path,
+        help="Path to save the input file with predictions",
+    )
+
+    # Create a parser for the estimate-memory command
+    predict_parser = sub_parsers.add_parser(
+        "estimate-memory", help="Estimate the amount of memory needed"
+    )
+    predict_parser.add_argument(
+        "-i",
+        "--input",
+        type=pathlib.Path,
+        required=True,
+        help="Object counts JSON file from bin/harvest planner",
+    )
+
+    args = parser.parse_args()
+
+    if args.command == "train":
+        train_model(args)
+    elif args.command == "estimate-memory":
+        predict_size(args)
+    else:
+        parser.print_help()
+
+
+def main():
+    parse_args()
+
+
+if __name__ == "__main__":
+    main()
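For a rough sense of the sizing arithmetic in `predict_size`: the model predicts RSS in bytes, which is padded by `PLUS_SOME` (20% headroom) and converted to MB before display. A standalone sketch of that post-processing, where the raw prediction value is hypothetical and only for illustration:

```python
BYTES_PER_MB = 1024 * 1024
PLUS_SOME = 1.2  # 20% headroom on top of the raw prediction

predicted_rss_bytes = 250_000_000  # hypothetical model output for one poller
estimated_mb = round(predicted_rss_bytes * PLUS_SOME / BYTES_PER_MB)
print(estimated_mb)  # 286 -- MB recommended for this poller
```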