From 7fe26082013cb67b4fd95a39315f7d011b05e190 Mon Sep 17 00:00:00 2001 From: clumsy9 Date: Tue, 24 Oct 2023 01:37:54 +0200 Subject: [PATCH] Artifacts (#4) * Revise Dockerfile to execute experiments and environment as non-root user * Add 'Running Your Own Experiments' to README.md * Add tox.ini to execute unit tests using tox * Add requirements_dev.in and requirements_dev.txt to prepare environment for unit tests * Add section on how to execute tests to README.md. * Fix wrong file name in Dockerfile * Fix typos in README.md and add code documentation. * Emphasize that automation scripts need to be executed from the project root directory * Remove obsolete code in amides/tests and amides.evaluation --------- Co-authored-by: Rafael Uetz --- Dockerfile | 20 +- README.md | 289 +++++++++++++-- amides/amides/data.py | 116 +++--- amides/amides/evaluation.py | 342 ++++-------------- amides/amides/events.py | 18 +- amides/amides/features/deduplicate.py | 79 +++- amides/amides/features/extraction.py | 46 ++- amides/amides/features/filter.py | 20 +- amides/amides/features/normalize.py | 15 + amides/amides/features/preprocessing.py | 9 +- amides/amides/features/tokenization.py | 28 +- amides/amides/models/selection.py | 28 +- amides/amides/persist.py | 56 ++- amides/amides/scale.py | 100 ++--- amides/amides/sigma.py | 14 +- amides/amides/utils.py | 67 ++-- amides/amides/visualization.py | 18 +- amides/bin/add_scaler.py | 5 + amides/bin/combine_models.py | 9 +- amides/bin/confidence.py | 11 +- .../pr_plot_powershell_proxy_registry.json | 2 +- .../config/process_creation/attr_plot.json | 2 +- .../process_creation/pr_plot_tainted.json | 2 +- .../prt_plot_misuse_rules_matches.json | 2 +- amides/bin/empty_attr.py | 90 ----- amides/bin/eval_attr.py | 0 amides/bin/eval_mcc_scaling.py | 74 +--- amides/bin/extract_features.py | 3 + amides/bin/extract_terms.py | 57 +-- amides/bin/extract_terms_multi.py | 49 +-- amides/bin/normalize.py | 11 +- amides/bin/plot_attr.py | 19 +- amides/bin/plot_df_hist.py | 3 + amides/bin/plot_df_values.py | 7 +- amides/bin/plot_multi_tainted.py | 14 +- amides/bin/plot_pr.py | 12 +- amides/bin/results.sh | 45 --- amides/bin/split_terms.py | 15 +- amides/bin/train.py | 90 +++-- amides/bin/train_new_types.py | 1 + amides/bin/train_new_types_multi.py | 2 + amides/bin/validate.py | 75 ++-- amides/bin/validate_new_types.py | 3 + amides/bin/validate_new_types_multi.py | 9 +- amides/classification.sh | 10 + amides/classification_other_types.sh | 15 + amides/experiments.sh | 13 + amides/requirements.in | 4 +- amides/requirements.txt | 2 +- amides/requirements_dev.in | 3 + amides/requirements_dev.txt | 99 +++++ amides/rule_attribution.sh | 7 + amides/setup.py | 10 +- amides/tainted_training.sh | 11 + amides/tests/data/collect_matches_evasions.py | 89 ----- amides/tests/data/json_to_jsonl.py | 0 amides/tests/data/scale_events.py | 0 .../rule_1/Proxy_Evasion_url_full_01.json | 0 .../rule_1/Proxy_Evasion_url_full_02.json | 0 .../rule_1/Proxy_Evasion_url_full_03.json | 0 .../rule_1/Proxy_Evasion_url_full_04.json | 0 .../{proxy => proxyweb}/rule_1/properties.yml | 0 .../sigma-study/rules/extract_proc_cmdline.py | 67 ---- .../rules/{proxy => proxyweb}/rule_1.yml | 0 .../Microsoft-Windows-PowerShell_4104.jsonl | 2 +- amides/tests/unit/test_data.py | 2 - amides/tests/unit/test_evaluation.py | 57 --- amides/tests/unit/test_events.py | 19 +- amides/tests/unit/test_extraction.py | 2 - amides/tests/unit/test_models_selection.py | 89 ++--- amides/tests/unit/test_persist.py | 96 +---- 
amides/tests/unit/test_result.py | 67 ++-- amides/tests/unit/test_sigma.py | 218 +++++------ amides/tox.ini | 13 + build_image.sh | 7 +- cleanup.sh | 23 +- create_containers.sh | 22 -- create_results.sh | 12 - remove_containers.sh | 16 +- remove_image.sh | 6 +- run_experiments.sh | 16 + start_env.sh | 9 +- 82 files changed, 1392 insertions(+), 1491 deletions(-) delete mode 100755 amides/bin/empty_attr.py mode change 100755 => 100644 amides/bin/eval_attr.py delete mode 100755 amides/bin/results.sh create mode 100755 amides/classification.sh create mode 100755 amides/classification_other_types.sh create mode 100755 amides/experiments.sh create mode 100644 amides/requirements_dev.in create mode 100644 amides/requirements_dev.txt create mode 100755 amides/rule_attribution.sh create mode 100755 amides/tainted_training.sh delete mode 100755 amides/tests/data/collect_matches_evasions.py mode change 100755 => 100644 amides/tests/data/json_to_jsonl.py mode change 100755 => 100644 amides/tests/data/scale_events.py rename amides/tests/data/sigma-study/events/{proxy => proxyweb}/rule_1/Proxy_Evasion_url_full_01.json (100%) rename amides/tests/data/sigma-study/events/{proxy => proxyweb}/rule_1/Proxy_Evasion_url_full_02.json (100%) rename amides/tests/data/sigma-study/events/{proxy => proxyweb}/rule_1/Proxy_Evasion_url_full_03.json (100%) rename amides/tests/data/sigma-study/events/{proxy => proxyweb}/rule_1/Proxy_Evasion_url_full_04.json (100%) rename amides/tests/data/sigma-study/events/{proxy => proxyweb}/rule_1/properties.yml (100%) delete mode 100755 amides/tests/data/sigma-study/rules/extract_proc_cmdline.py rename amides/tests/data/sigma-study/rules/{proxy => proxyweb}/rule_1.yml (100%) delete mode 100644 amides/tests/unit/test_evaluation.py create mode 100644 amides/tox.ini delete mode 100755 create_containers.sh delete mode 100755 create_results.sh create mode 100755 run_experiments.sh diff --git a/Dockerfile b/Dockerfile index 0dbcb02..c535c61 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,15 +2,21 @@ FROM python:3.11-slim-bullseye AS base -RUN apt-get update && apt-get upgrade -y && apt-get install -y jq +RUN addgroup --gid 1000 docker-user && \ + adduser --uid 1000 --gid 1000 --disabled-password --gecos "" docker-user && \ + echo "docker-user ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \ + apt-get update && apt-get upgrade -y && apt-get install -y jq -ADD ./amides /amides -WORKDIR /amides +ADD ./amides /home/docker-user/amides -RUN python -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" -RUN python -m pip install --upgrade pip && pip install -r requirements.txt && pip install . -RUN chmod +x bin/results.sh +RUN python -m venv /home/docker-user/amides/venv +ENV PATH="/home/docker-user/amides/venv/bin:$PATH" +RUN chown -R docker-user:docker-user /home/docker-user/amides + +WORKDIR /home/docker-user/amides +USER docker-user +RUN pip install --upgrade pip && pip install -r requirements_dev.txt && pip install tox && pip install -e . +RUN chmod +x experiments.sh classification.sh rule_attribution.sh tainted_training.sh classification_other_types.sh diff --git a/README.md b/README.md index c8520e6..85a7198 100644 --- a/README.md +++ b/README.md @@ -1,90 +1,304 @@ -# Adaptive Misuse Detection System (AMIDES) +

# Adaptive Misuse Detection System (AMIDES)

-The Adaptive Misuse Detection System (AMIDES) extends conventional rule matching of SIEM systems by applying machine learning components that aim to detect attacks evading existing SIEM rules as well as otherwise undetected attack variants. It learns from SIEM rules and historical benign events and can thus estimate which SIEM rule was tried to be evaded. An overview of AMIDES is depicted in the figure below. +The Adaptive Misuse Detection System (AMIDES) extends conventional rule matching of SIEM systems by applying machine learning components that aim to detect attacks evading existing SIEM rules as well as otherwise undetected attack variants. It learns from SIEM rules and historical benign events and can thus estimate which SIEM rule an attacker tried to evade. A brief overview of AMIDES is given in [Overview](#overview). -![amides_architecture](./docs/amides.png) +This repository contains the source code of the `amides` Python package. The package contains the modules and scripts that enable training and validating models for AMIDES, evaluating the models' classification performance, and creating meaningful visualizations that help users assess the evaluation results. Additionally, the repository contains initial training and validation data that enables building and evaluating models for AMIDES. + +For operational use, AMIDES is integrated into [Logprep](https://logprep.readthedocs.io/en/latest/user_manual/configuration/processor.html#amides), a pipeline-based log message preprocessor also written in Python. The package also contains additional scripts that help to prepare models for operational use with Logprep. For more information on how to prepare AMIDES models for Logprep, see [Preparing Models for Logprep](#preparing-models-for-logprep). + +## Overview - Incoming events are transformed into feature vectors by the feature extraction component. During operation, features learned during the training phase will be re-used by the feature extraction component. Feature vectors are then passed to the Misuse Classification component, which classifies events as malicious or benign. In case of a malicious result, the feature vector is passed to the Rule Attribution component, which generates a ranked list of SIEM rules potentially evaded by the event. +![amides_architecture](./docs/amides.png) -This repository contains the source code used for model training, validation, and evaluation, as well as some initial training and validation data that enable to build and evaluate models for AMIDES. -For operational use, AMIDES is integrated into [Logprep](https://logprep.readthedocs.io/en/latest/user_manual/configuration/processor.html#amides). +AMIDES is trained using a set of SIEM detection rules and historical benign events taken from an organization's corporate network. +During operation, incoming events are passed to the rule matching component and the feature extraction component, which transforms the events into feature vectors. The features required for vectorization have been learned during the training phase. The feature vectors are then passed to the misuse classification component, which classifies events as malicious or benign. In case of a malicious result, the feature vector is passed to the rule attribution component, which generates a ranked list of SIEM rules potentially evaded by the event. In the final step, potential alerts of the rule matching and both machine-learning components are merged into a single alert by the alert generation component.
## System Requirements -AMIDES was developed and tested on Linux using Python 3.10. Before attempting to use AMIDES, make sure you have +AMIDES was developed and tested on Linux using Python 3.10. Before attempting to use `amides`, make sure you have - Physical or virtual host with a Linux-based OS - A minimum of 8 GB of RAM -- At least 1 GB of HDD space +- At least 2 GB of HDD space - Python 3.10 (or newer) +- jq -The repository also contains a `Dockerfile` that enables to create a quickstart environment with AMIDES and all the required dependencies already installed. To build and use the quickstart environment, Docker is required. Building and using the quickstart environment has been tested using Docker 20.10. +The repository contains a `Dockerfile` that creates a quickstart environment for the `amides` package. For testing purposes, we highly recommend to use the quickstart environment. Building and using the environment has been tested with `docker 20.10`. ## Accessing Code and Initial Data -In order to access the AMIDES source code and initial data, change into the target location on your system and clone the repository by executing +In order to access source code and initial data, change into the target location on your system and clone the repository by executing - git clone https://github.com/fkie-cad/amides.git +```bash +git clone https://github.com/fkie-cad/amides.git +``` or - git clone git@github.com:fkie-cad/amides.git +```bash +git clone git@github.com:fkie-cad/amides.git +``` in case you prefer to use SSH. Alternatively, you can get the repository by downloading the `.zip`-file from the repository's main page and unpack it into your target location. -The `amides` package in the `amides` directory contains modules and scripts that enable to train and validate models for AMIDES, evaluate the model's classification performance, and create meaningful visualizations that help users to assess the models classification performance. The package also contains additional scripts and classes that help to prepare generated models for operational use with Logprep and perform further testing. +The `amides` package is located in the `amides` directory. Initial data to train and validate models for AMIDES is provided in the `data` directory. -Initial data to train and validate models for AMIDES is provided in the `data` directory. The `socbed` folder contains a small set of benign event logs for each of the four different rule types that AMIDES was tested with: Windows Process Creation, Web Proxy, Windows Registry, and Windows PowerShell. The provided benign data was generated using [SOCBED](https://github.com/fkie-cad/socbed). +### SOCBED Datasets -Converted Sigma rules, matches, and a small number of evasions already revealed in the corresponding [academic research paper](#documentation) are located in the `data/sigma` folder. Converted rules required for model training are located in `data/sigma/rules`, matches and evasions required for model validation are located in `data/sigma/events`. +The [SOCBED](https://github.com/fkie-cad/socbed) framework was used to generate benign datasets for each of the four different SIEM rule and event types that AMIDES was tested with. The `data/socbed` folder contains a sub-folder with a small dataset for each of the event types. The samples in the `train` and `validation` files of each sub-folder have already been split and normalized for the usage with training and validation scripts. 
The `all` file holds both training and validation samples in a non-normalized format. The following datasets are provided: +- `windows/process_creation` - The dataset in this folder consists of process command-lines taken from the `CommandLine` field of Sysmon `ProcessCreation` events. +- `proxy_web` - This folder contains full URLs observed in web-proxy logs. +- `windows/registry` - Samples in this folder are registry keys extracted from the `TargetObject` field of Sysmon `RegistryEvent (Value Set)` and `RegistryEvent (Object Create and Delete)` events. For `Value Set` events, the samples also hold the corresponding `Details` value. +- `windows/powershell` - The samples of this dataset are `ScriptBlockText` field values extracted from `Microsoft-Windows-PowerShell/Operational 4104` events. -## Installing ## ### Sigma Rules, Matches, and Evasions -Like other Python packages, the `amides` package can be installed system-wide, or into a Python virtual environment. We highly recommend using a dedicated virtual environment for `amides`. Virtual environments can be created using either the `venv` or `virtualenv` package. To create a dedicated virtual environment for `amides`, execute +The SIEM detection rules provided in this repository are converted Sigma rules. The rules have been converted using the Sigma rule converter and are located in `data/sigma/rules`. The rule types match the event types of the benign SOCBED datasets. The converted rules in `data/sigma` are thus organized in a folder structure similar to that of the benign datasets. - python3 -m venv +Corresponding matches, i.e., SIEM events triggering the detection rules, and a small number of evasions, i.e., matches adapted such that the executed commands achieve the exact same goal without triggering the respective rule, which were already revealed in the corresponding [research paper](#documentation), are provided in `data/sigma/events`. Both matches and evasions of a specific rule are organized in separate `.json` files. Files with matches carry the pattern `_Match_` in their name, evasions the pattern `_Evasion_`. -or +## Getting Started + +In order to just run the [experiments](#running-experiments), we highly recommend using the quickstart environment where the `amides` package and all its dependencies are already installed. The quickstart environment can also be used if experiments on your own datasets should be carried out. Installing `amides` on your local system (or using a virtual environment) is also possible. + +### Building the Quickstart Environment + +Using the quickstart environment requires a `docker` installation. Building and running the environment was tested using `docker 20.10`, but it should also be compatible with other Docker versions. + +In order to build the `amides:base` image for the quickstart environment, change into the project root directory and execute + +```bash +./build_image.sh +``` + +This will execute the corresponding `docker build` command. The image is based on the `python:3.11-slim-bookworm` image. If the quickstart environment image is no longer needed at some point, it can be removed by executing the `remove_image.sh` script from the project root directory. + +### Installation + +In case you want to use `amides` without the quickstart environment, the package can also be locally installed like other Python packages. We highly recommend using a dedicated virtual environment for `amides`, though. Virtual environments are created using `venv`.
To create a dedicated virtual environment, execute + +```bash +python3 -m venv <path-to-venv> +``` + +After the environment has been created, activate it by executing + +```bash +source <path-to-venv>/bin/activate +``` + +To install the `amides` package and all its dependencies, change into the `amides` directory and execute + +```bash +pip install -r requirements.txt +pip install . +``` + +Now, the `amides` modules and scripts should be usable in your virtual environment. + +### Testing + +`tox` can be used to execute the unit tests of the `amides` package, located in `amides/tests`. `tox` can be installed from PyPI via + +```bash +pip install tox +``` + +To get an overview of the configured test environments, execute + +```bash +tox -av +``` + +in the package root folder. The `tox.ini` file currently provides configurations to run unit tests on Python 3.10 and 3.11. For example, executing + +```bash +tox -e py310-tests +``` + +will execute unit tests using Python 3.10. + +## Running Experiments + +The `amides` package comes with a bash script named `experiments.sh` which is located in the package root folder. Executing the script from the package root folder will use the SOCBED and Sigma rule data in `data` and carry out the same four experiments that support the major claims of the corresponding [research paper](#documentation). Each of the experiments is described in more detail below. + +Since the benign datasets in this repository are generated using SOCBED, and not taken from the same enterprise network as in the research paper, the generated experiment results will look different. Hence, we provide the additional document `Paper Supplement.pdf` that shows the correct output that is to be expected when using the SOCBED datasets. +Due to responsible disclosure, most of the evasions are not available in this repository. Thus, the results produced with the small number of evasions in this repository are different again. + +### Classification Performance + +This experiment compares the classification performance of AMIDES' misuse classification component to the benchmark approach that learns from matches (attack events) instead of the Sigma rule filters. This experiment can be carried out alone by executing `classification.sh` from the package root folder, either using your local installation or the environment container. A look into the script file reveals the scripts and configuration files used for this experiment. + +When finished, the file `figure_3_c1_c2_misuse_classification.pdf` showing precision, recall, f1-score, and mcc for the threshold range from 0 to 1 is located in the `amides/plots/process_creation` folder. + +### Rule Attribution + +This experiment evaluates the AMIDES rule attribution model's attribution performance by assessing the position of the correct detection rule in the ranked list of possibly evaded detection rules. This experiment can be run by changing into the `amides` package directory and executing the `rule_attribution.sh` script. The Python scripts used and the corresponding configuration files are revealed in the script file. + +After execution, figure `figure_4_c3_rule_attribution.pdf` in `amides/plots/process_creation` visualizes the distribution and cumulative distribution of the correct rule rankings returned by the rule attribution model. + +### Tainted Training Data + +The influence of different fractions of tainted training data on AMIDES' misuse classification model is evaluated in this experiment.
During the experiment, 10%, 20%, and 30% of the Sigma rule evasions are used to taint benign samples for the training of AMIDES' misuse classification model. For each fraction of tainted data, the training data is tainted ten times. This specific experiment can be re-run by changing into the `amides` package folder and executing the `tainted_training.sh` script. + +Precision and recall of all 30 training runs are shown in `figure_5_c4_tainted_training.pdf`, also located in the `amides/plots/process_creation` folder. + +### Other Rule and Event Types + +The classification performance of the AMIDES misuse classification model for Windows PowerShell, Windows Registry, and Web-Proxy datasets is evaluated in this experiment. The experiment can be carried out by executing `classification_other_types.sh` from the `amides` package root folder. Precision and recall of the models trained on the given SOCBED data are shown in `figure_6_c5_classification_other_types.pdf`, located in `amides/plots`. - python3 -m virtualenv +## Running Experiments using the Quickstart Environment -in case you want to use `virtualenv`. After the environment has been created, activate it by executing +After the image of the quickstart environment has been successfully created, change into the project root directory and execute - source /bin/activate +```bash +./run_experiments.sh +``` -To install the `amides` package and the required dependencies, change into the `amides` directory and execute +This will run the `amides-experiments` container that executes the `experiments.sh` script of the `amides` package. The container is configured to use the bind mounts `amides/models` and `amides/plots` for results generated during the experiments, as well as the `data` mount as source for input data used for the experiments. This means that after the container's execution, models and plots generated by the experiments are accessible via the `amides/models` and `amides/plots` directories in the project root folder. The default input data used for model training and validation is taken from the `data` directory. - pip install --upgrade pip - pip install -r requirements.txt - pip install . +To start the quickstart environment for running your own experiments, change into the project root directory and execute +```bash +./start_env.sh +``` -## Running Experiments ## +The script creates and starts the `amides-env` container which is created from the same base image as the `amides-experiments` container. When started, the `amides-env` container immediately starts a bash shell inside the container. The shell allows you to use and configure the modules and scripts of the `amides` package to run your own experiments (see the illustrative session below). Supporting the same bind mounts as the `amides-experiments` container, the `amides-env` container enables you to build and evaluate models using your own data. +Both containers are run using the `--rm`-flag, which means they will be automatically removed once they finish execution. -## Building and Using the Quickstart Environment ## +Executing `cleanup.sh` from the same location will remove the base image as well as all models and plots placed in the default `amides/plots` and `amides/models` bind mount directories. -As an alternative to installing the `amides` package on your local system, we provide a `Dockerfile` to create an AMIDES quickstart environment where `amides` and all its requirements are already installed. Building and running the quickstart environment requires a Docker installation.
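For example, once the quickstart environment has been built, a single experiment can be re-run from inside the `amides-env` container roughly as in the following illustrative session (adjust the script name to the experiment you are interested in):

```bash
./start_env.sh        # opens an interactive shell inside the amides-env container
# inside the container shell, from the amides package root:
./classification.sh   # re-runs the misuse classification experiment using the mounted data
```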
+## Running Your Own Experiments -The `amides` package enables you to create models for AMIDES from your own datasets. The scripts in `amides/bin` are ready to train, validate, and evaluate models for both the misuse classification and rule attribution components of AMIDES. The current training, validation, and evaluation processes using these scripts are described below in more detail. - build_image.sh +Not all modules and classes of the `amides` package are currently used. However, most of them are still compatible and usable, and some can be configured by configuration parameters. -script located in the project's root folder. This will execute the corresponding `docker build` command. The image is based on the `python:3.11-slim-bookworm` image. If the image is no longer needed at some point, it can be removed by executing the `remove_image.sh` script. +Training, validation, and evaluation allow you to specify various configuration parameters that are usually provided as command-line arguments and options. Using the `-h` or `--help` flag on a script reveals the command-line options and arguments that are supported by it. -After the `amides:base` image has been successfully created, executing the +Due to the number of configuration parameters supported by many scripts, almost all of them also support configuration files. Options and configuration parameters are placed in a `.json` file, where options are specified as keys, and parameters are placed in values. Config files are provided via the `--config` flag. - create_containers.sh +### Creating Misuse Classification Models -script will create two separate containers: `amides-results` and `amides-env`. +The training of misuse classification models is performed using `train.py`. First, the script takes the given benign samples and SIEM rule filters and converts them into feature vectors. The location of benign samples is specified by the `--benign-samples` flag. Currently, benign training data needs to be provided in `.txt`- or `.jsonl`-files, containing one sample per line. Prior to vectorization, benign samples are normalized. Data can either be provided already normalized, or it still needs to be normalized; in the latter case, the `--normalize` flag has to be set. -The `amides-results` container is specifically created to execute the `results.sh` script described in the [experiments](#running-experiments). The container is configured to mount the `amides/plots`, `amides/models`, and `data` directories of the container as bind mounts onto the local file systems. This means that plots and models generated during the experiment execution will be available in the equally named directories on the local system. +In case your data should be normalized beforehand, you can use the `normalize.py`-script. Samples need to be provided in the same format as for `train.py`. The script applies the normalization currently used by AMIDES and stores the normalized samples in an output file. Assuming your dataset is located at `data/socbed/process_creation/all`, normalize it by executing -The `amides-env` container provides the actual quickstart environment for AMIDES. Starting the container runs a bash script inside the container, which can then be used to execute several scripts of the `amides` package, including training and validating, plotting results, etc.
The `amides-env` container supports the same bind mounts as the `amides-results` container, which means event data placed in the `data` folder are accessible from within the container. Models, results, and plots +```bash +./bin/normalize.py "../data/socbed/process_creation/all" --out-file "../data/socbed/process_creation/train/all_normalized" +``` +The locations of the SIEM detection rules and the corresponding matches and evasions are defined by `--rules-dir` and `--events-dir`. SIEM rule filters and events are loaded by the `RuleSetDataset`-class of the `amides.sigma` module. The `--malicious-samples-type` flag determines the type of malicious samples used for training. `rule_filters` uses the SIEM rule filters, while `matches` takes the actual attack events. + +After normalization, the feature extractor converts all samples into TF-IDF vectors. With the `--vectorization` option, other feature extraction and vectorization methods are available. The vectors are later used to fit the SVM model. The `--search-params` flag determines whether the hyper-parameters of the SVM should be optimized or the SVM should just be fitted with default parameters. Currently, the optimization is performed by the `GridSearchCV` class of `scikit-learn`. `GridSearchCV` exhaustively generates candidates from a grid of parameter values. The currently pre-set range of parameter values has been discovered throughout various experiments performed during the AMIDES development. The cross-validation splitting strategy is a Stratified-K-Fold approach. The `--cv` flag determines the number of folds. The score function used to evaluate a parameter setting is specified by `--scoring`. The default score function is 'f1-score', but nearly all score functions of `sklearn.metrics` are compatible. + +After parameters have been established and the model has been fit, an additional output scaler is created. The `--mcc-scaling` flag specifies whether the scaler range is derived from the mcc values on the benign training data. The `--mcc-threshold` option sets the threshold value that is applied symmetrically to determine the required value range. + +By executing + +```bash +./bin/train.py --benign-samples "../data/socbed/process_creation/train" --events-dir "../data/sigma/events/windows/process_creation" --rules-dir "../data/sigma/rules/windows/process_creation" --type "misuse" --malicious-samples-type "rule_filters" --search-params --cv 5 --mcc-scaling --mcc-threshold 0.5 --result-name "misuse_model" --out-dir "models/process_creation" +``` + +a misuse classification model is trained using the benign command-lines in `../data/socbed/process_creation/train` and the SIEM rule filters in `../data/sigma/rules/windows/process_creation`. + +The final model is encapsulated into a `TrainingResult` object, together with the transformed training data vectors, the feature extractor, and the scaler. The object gets pickled into the location specified by the `--out-dir` flag. An additional JSON file containing basic information on model parameters etc. is also generated in the same location. + +After training, the model needs to be validated. The `validate.py` script loads the model and data from the pickled `TrainingResult` object and calculates decision function values on the specified validation dataset. + +Benign validation data is provided in the same way as for model training, using the `--benign-samples` option. The `--malicious-samples-type` flag determines whether malicious samples should be `evasions` or `matches`.
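Like most of the scripts, `validate.py` can also read its options from a `.json` config file passed via `--config`. The following sketch is hypothetical: the key names are an assumption chosen to mirror the long command-line option names, so check the script's `--help` output for the options it actually accepts.

```json
{
    "result_path": "models/process_creation/train_rslt_misuse_model.zip",
    "benign_samples": "../data/socbed/process_creation/validation",
    "events_dir": "../data/sigma/events/windows/process_creation",
    "rules_dir": "../data/sigma/rules/windows/process_creation",
    "malicious_samples_type": "evasions",
    "out_dir": "models/process_creation"
}
```

The equivalent command-line invocation is shown below.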
By executing + +```bash +./bin/validate.py --result-path "models/process_creation/train_rslt_misuse_model.zip" --benign-samples "../data/socbed/process_creation/validation" --events-dir "../data/sigma/events/windows/process_creation" --rules-dir "../data/sigma/rules/windows/process_creation" --malicious-samples-type "evasions" --out-dir "models/process_creation" +``` + +the previously trained model is validated using the benign data in `data/socbed/process_creation/validation` and the evasions located in `data/sigma/events/windows/process_creation`. The final result is bundled into a `ValidationResult` object, which is pickled into the specified output location. + +After the misuse model has been validated, its classification performance is evaluated. The `eval_mcc_scaling.py` script loads the validated model and calculates precision, recall, f1-score, and mcc values for the decision function value range that is determined by a specified mcc threshold value. +The number of evaluation thresholds (or iterations) in the target value range is specified by the `--num-eval-thresholds` flag. By executing + +```bash +./bin/eval_mcc_scaling.py --valid-results "models/process_creation/valid_rslt_misuse_model.zip" --num-eval-thresholds 50 --out-dir "models/process_creation" +``` + +the classification performance of the loaded model is evaluated for 50 evenly spaced threshold values of the dynamically determined threshold interval. The evaluation results are collected by a `BinaryEvaluationResult` object, which is also pickled. + +To visualize the evaluation results, the `plot_pr.py`-script is used to create a precision-recall-thresholds plot: + +```bash +./bin/plot_pr.py --result "models/process_creation/eval_rslt_misuse_model.zip" --type "prt" --out-dir "plots" +``` + +### Performing Tainted Training + +Tainted training is performed in the same way as training misuse classification models. For tainted training, the `--tainted-benign-samples` and `--tainted-seed` options are provided to `train.py`. The `tainted-benign-samples` option takes a value between 0 and 100 and defines the fraction of evasions that are used as benign training samples. In order to re-create tainted training results, the `tainted-seed` parameter can be provided. The seed value fixes the set of evasions that are used for tainting. Executing + +```bash +./bin/train.py --benign-samples "../data/socbed/process_creation/train" --events-dir "../data/sigma/events/windows/process_creation" --rules-dir "../data/sigma/rules/windows/process_creation" --type "misuse" --malicious-samples-type "rule_filters" --tainted-benign-samples 10.0 --tainted-seed 42 --search-params --cv 5 --mcc-scaling --mcc-threshold 0.5 --result-name "misuse_model_tainted" --out-dir "models/process_creation/tainted/10" +``` + +trains and optimizes a misuse classification model using 10% of the evasions as benign samples. The seed value used to fix the set of evasions used for tainting is 42. + +Tainted share and tainted seed values are held by `TrainingResult` objects. When the model is validated, `validate.py` takes the tainted seed and share values to remove the evasions already used for training. Evaluation of tainted training models is performed by `eval_mcc_scaling.py` in the same way as for other validation results. + +Visualizing precision and recall of the `EvaluationResult` objects of multiple tainted training results can be done with the `plot_multi_tainted.py` script.
An optional base result without any tainting can be included using the `--base-result` flag: + +```bash +./bin/plot_multi_tainted.py --base-result "models/process_creation/valid_rslt_misuse_model.zip" --low-tainted "models/process_creation/tainted/10/eval_rslt_misuse_model_tainted.zip" --out-dir "plots" +``` + +### Creating Rule Attribution Models + +Rule attribution models are also generated using `train.py`. Creating a rule attribution model basically consists of creating a misuse classification model for each of the SIEM rules of the rule dataset that you provide. Only the compilation of datasets used for training is different. + +To build a rule attribution model, the script is started with the `--type "attribution"` option. The process of training rule attribution models can be parallelized. `train.py` supports the `--num-subprocesses` option to specify the number of sub-processes used for training the single rule models. To create a rule attribution model from the benign command-lines and the SIEM rule data in `data/`, execute + +```bash +./bin/train.py --benign-samples "../data/socbed/process_creation/train" --events-dir "../data/sigma/events/windows/process_creation" --rules-dir "../data/sigma/rules/windows/process_creation" --type "attribution" --malicious-samples-type "rule_filters" --search-params --search-method "GridSearch" --mcc-scaling --mcc-threshold 0.5 --result-name "attr_model" --out-dir "models/process_creation" +``` + +The rule models are gathered by a `MultiTrainingResult` object, where each entry is a `TrainingResult` object itself. + +The evaluation of the rule attribution performance is done by the `eval_attr.py` script. For the rule attribution evaluation, a mapping of rules and their corresponding evasions is required. +The mapping can be provided as a `.json`-file via the `--rules-evasions` flag. In this JSON file, rule names should be used as keys, and the corresponding evasions are grouped into a list value. + +```json +{ + "New RUN Key Pointing to Suspicious Folder": [ + "HKLM\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run\\foo\\\\%%windir%%\\Temp\\calc.exe" + ] +} +``` + +Alternatively, the mapping is automatically built from the evasion and rule data specified by `--events-dir` and `--rules-dir`. The evaluation is started by executing + +```bash +./bin/eval_attr.py --multi-result "models/process_creation/multi_train_rslt_attr_model.zip" --events-dir ../data/sigma/events/windows/process_creation --rules-dir "../data/sigma/rules/windows" +``` + +Results of the rule attribution evaluation are encapsulated in `RuleAttributionEvaluationResult` instances, which are also pickled. + +Visualizing the rule attribution evaluation results is performed by the `plot_attr.py` script. The `--plot` option allows choosing between the normal distribution, the cumulative distribution, and a combination of both. To get both distributions into the same plot, choose the `combined` option. + +```bash +./bin/plot_attr.py --eval-result "models/process_creation/rl_attr.zip" --plot "combined" --title "rule_attribution_eval_socbed" --out-dir "plots" +``` + +The generated plot type is the same as in the [rule attribution](#rule-attribution) experiment. + +### Preparing Models for Logprep + +Models for the operational use of AMIDES' misuse classification and rule attribution components need to be combined into a single `.zip` file, which is provided to the Logprep instance. The models for the misuse classification and rule attribution components are bundled using the `combine_models.py` script.
The pickled `TrainingResult` (or 'ValidationResult') containing the misuse classification model is specified by the `--single` option, the pickled `MultiTrainingResult` containing models for the rule attribution component is determined with the `--multi` flag. By executing + +```bash +./bin/combine_models.py --single "models/process_creation/valid_rslt_misuse_model.zip" --multi "models/process_creation/multi_train_rslt_attr_model.zip" --out-dir "models/operational" +``` ## Documentation @@ -93,7 +307,6 @@ The corresponding academic research paper will be published in the proceedings o R. Uetz, M. Herzog, L. Hackländer, S. Schwarz, and M. Henze, “You Cannot Escape Me: Detecting Evasions of SIEM Rules in Enterprise Networks,” in *Proceedings of the 33rd USENIX Security Symposium (USENIX Security)*, 2024.[[DOI]()] [[arXiv]()] - ## License The files in this repository are licensed under the GNU General Public License Version 3. See [LICENSE](LICENSE) for details. diff --git a/amides/amides/data.py b/amides/amides/data.py index eeb2072..137a43e 100644 --- a/amides/amides/data.py +++ b/amides/amides/data.py @@ -1,7 +1,8 @@ -"""This module provides classes and functions to hold and prepare data for the classification process.""" +"""This module provides classes and functions to hold and prepare datasets for the training and validation process. +""" +from abc import ABC, abstractmethod import numpy as np -from abc import ABC, abstractmethod from scipy import sparse from amides.utils import get_current_timestamp @@ -69,6 +70,7 @@ def __init__(self, samples, labels, label_names=None, feature_info=None): @property def samples(self): + """Return samples.""" return self._samples @samples.setter @@ -82,6 +84,7 @@ def samples(self, samples): @property def labels(self): + """Return labels.""" return self._labels @labels.setter @@ -98,6 +101,7 @@ def labels(self, labels): @property def label_names(self): + """Return label names.""" return self._label_names @label_names.setter @@ -109,14 +113,17 @@ def label_names(self, label_names): @property def feature_info(self): + """Return feature info string.""" return self._feature_info @property def size(self): + """Return the number of samples in the bunch.""" return self._samples.shape[0] @property def shape(self): + """Return the shape of the samples array.""" return self._samples.shape def add_feature_info(self, info): @@ -390,18 +397,15 @@ def __init__(self): def file_name(self): """Returns file name which is mainly used when data splits should be pickled.""" - pass @abstractmethod def stack_horizontally(self, data_split): """Stack splitted data horizontally.""" - pass @abstractmethod def create_info_dict(self): """Return basic information on data split. 
Mainly used for integration when objects are being pickled.""" - pass class TrainTestSplit(DataSplit): @@ -431,6 +435,7 @@ def __init__(self, train_data=None, test_data=None, name=None): @property def name(self): + """(Sets) and returns the name of the split.""" if self._name is None: self._build_name_from_data_info() @@ -442,6 +447,7 @@ def name(self, name): @property def train_data(self): + """Return the training data.""" return self._data["train"] @train_data.setter @@ -450,6 +456,7 @@ def train_data(self, data): @property def test_data(self): + """Return the test-data.""" return self._data["test"] @test_data.setter @@ -548,7 +555,7 @@ def _build_name_from_data_info(self): class TrainTestValidSplit(TrainTestSplit): - """TrainTestValibSplit-class to create objects containing data splits + """TrainTestValidSplit-class to create objects containing data splits for training, testing, and validation. Testing or validation data could also be used for other purposes. """ @@ -572,6 +579,7 @@ def __init__(self, train_data=None, test_data=None, valid_data=None, name=None): @property def validation_data(self): + """Returns the validation data.""" return self._data["valid"] @validation_data.setter @@ -644,58 +652,6 @@ def _build_name_from_data_info(self): self._name = f"{self._name}_{info}" -class PrecisionRecallData: - """PrecisionRecallData to represent already calculated precision and recall data""" - - def __init__(self, precision, recall, thresholds=None, name=None): - """Creates PrecisionRecallData instances holding precision and recall data. - - Parameters - ---------- - precision: List or np.array - Precision data. - recall: List or np.array - Recall data. - thresholds: List or np.array - Threshold values that were used to calculate precision and recall data. - name: Optional[str] - Name of the PrecisionRecallData instance (usually used for visualization). 
- """ - - self._name = name - self._precision = precision - self._recall = recall - self._thresholds = thresholds - - @property - def name(self): - return self._name - - @property - def precision(self): - return self._precision - - @property - def recall(self): - return self._recall - - @property - def thresholds(self): - return self._thresholds - - -class PlotData: - def __init__(self, data, name): - self.data = data - self.name = name - - -class ReliabilityEvaluationData: - def __init__(self, probabilities, labels): - self.probabilities = probabilities - self.labels = labels - - class TrainingResult: """Holds trained estimator instance and the used training data.""" @@ -745,6 +701,7 @@ def __init__( @property def estimator(self): + """Returns the trained model.""" return self._estimator @estimator.setter @@ -753,6 +710,7 @@ def estimator(self, estimator): @property def data(self): + """Returns the training data.""" return self._data @data.setter @@ -761,6 +719,7 @@ def data(self, data): @property def tainted_share(self): + """Returns the fraction of tainting.""" return self._tainted_share @tainted_share.setter @@ -769,6 +728,7 @@ def tainted_share(self, share): @property def tainted_seed(self): + """Returns the seeding used for tainting.""" return self._tainted_seed @tainted_seed.setter @@ -777,10 +737,12 @@ def tainted_seed(self, seed): @property def feature_extractors(self): + """Returns the feature extractor.""" return self._feature_extractors @property def scaler(self): + """Returns the symmetric min-max scaler.""" return self._scaler @scaler.setter @@ -789,10 +751,12 @@ def scaler(self, scaler): @property def timestamp(self): + """Returns the timestamp value.""" return self._timestamp @property def name(self): + """Returns the name of the result.""" if self._name is None: self._build_name_from_result_info() @@ -803,6 +767,13 @@ def name(self, name): self._name = name def add_feature_extractor(self, feat_extractor): + """Add feature extractor to the result. + + Parameters + ---------- + feat_extractor: Vectorizer + The feature extractor to be added. + """ self._feature_extractors.append(feat_extractor) def file_name(self): @@ -828,6 +799,13 @@ def file_name(self): return file_name def create_info_dict(self): + """Creates an info dict containin meta information in human-readable format. + + Returns + ------- + :dict + Dictionary containing meta information. 
+ """ info = { "estimator": self._estimator.__class__.__name__, "estimator_params": self._estimator.get_params(), @@ -911,6 +889,7 @@ def __init__( @property def predict(self): + """Returns the decision function values.""" return self._predict def file_name(self): @@ -955,6 +934,7 @@ def __init__(self, name=None, timestamp=None, benign_training_data=None): @property def name(self): + """Return the name of the result.""" if self._name is None: self._name = "multi_train_rslt" @@ -966,6 +946,7 @@ def name(self, name): @property def timestamp(self): + """Return the timestamp value.""" return self._timestamp @timestamp.setter @@ -974,10 +955,12 @@ def timestamp(self, timestamp): @property def results(self): + """Return the results dictionary.""" return self._results @property def benign_train_data(self): + """Return the common benign training data.""" return self._benign_train_data @benign_train_data.setter @@ -1007,7 +990,14 @@ def get_result(self, result_name): return result def file_name(self): - if self.name.startswith("multi_train_result"): + """Build a file name starting with 'multi_train_rslt' + + Returns + ------- + :str + The file name starting with 'multi_train_rslt' + """ + if self.name.startswith("multi_train_rslt"): file_name = self.name else: file_name = f"multi_train_rslt_{self.name}" @@ -1018,6 +1008,13 @@ def file_name(self): return file_name def create_info_dict(self): + """Creates an info dict containing meta information in human-readable format. + + Returns + ------- + :dict + Dictionary containing meta information. + """ results_info = {} for key, result in self._results.items(): @@ -1072,6 +1069,7 @@ def name(self): @property def benign_valid_data(self): + """Returns common benign validation data.""" return self._benign_valid_data @benign_valid_data.setter diff --git a/amides/amides/evaluation.py b/amides/amides/evaluation.py index abc63c4..6fa07b7 100644 --- a/amides/amides/evaluation.py +++ b/amides/amides/evaluation.py @@ -1,3 +1,4 @@ +"""This module holds classes used for model evaluation.""" import math import numpy as np @@ -39,10 +40,24 @@ def __init__(self, thresholds=None, name=None, timestamp=None): @property def thresholds(self): + """Returns array of threshold values. + + Returns + ------- + :np.ndarray + Array of threshold values. + """ return self._thresholds @property def precision(self): + """Returns array of precision values. + + Returns + ------- + :np.ndarray + Array of threshold values. + """ return self._precision @precision.setter @@ -51,6 +66,13 @@ def precision(self, precision): @property def recall(self): + """Returns array of recall values. + + Returns + ------- + :np.ndarray + Array of recall values. + """ return self._recall @recall.setter @@ -59,10 +81,24 @@ def recall(self, recall): @property def f1_scores(self): + """Returns array of f1-score values. + + Returns + ------- + :np.ndarray + Array of f1-score values. + """ return self._f1_scores @property def mccs(self): + """Returns array of mcc values. + + Returns + ------- + :np.ndarray + Array of mcc values. + """ return self._mccs @property @@ -87,9 +123,17 @@ def evaluate(self, labels, predict): self._calculate_no_skill(labels) def optimal_threshold_index(self): + """Returns the optimal threshold index of the maximum f1-score value. + + Returns + ------- + :int + Index of the maximum f1-score. 
+ """ return np.argmax(self._f1_scores) def file_name(self): + """Build and return the file name starting with 'eval_rslt_'""" file_name = ( self.name if self.name.startswith("eval_rslt") else f"eval_rslt_{self.name}" ) @@ -100,6 +144,7 @@ def file_name(self): return file_name def create_info_dict(self): + """Creates and returns dict containing meta information in human-readable format.""" optimal_index = self.optimal_threshold_index() default_index = self.default_threshold_index() info = { @@ -137,6 +182,7 @@ def create_info_dict(self): return info def default_threshold_index(self): + """Returns the default threshold index.""" default_idx = np.where(self._thresholds == 0.5) if len(default_idx[0]) > 0: @@ -166,7 +212,6 @@ def _init_result_arrays(self, thresholds): def _evaluate_with_given_thresholds(self, labels, predict): for i, threshold in enumerate(self._thresholds): - # print(f"Iteration: {i}") new_predict = np.where(predict >= threshold, 1, 0) self._precision[i] = precision_score( y_true=labels, y_pred=new_predict, zero_division=1 @@ -198,274 +243,9 @@ def _calculate_no_skill(self, labels): self._no_skill = len(labels[labels == 1]) / len(labels) -class RawBinaryEvaluationResult: - """RawBinaryEvaluationResult to evaluate classification result against custom - threshold values.""" - - def __init__(self, thresholds, name=None, timestamp=None): - """Create objects. - - Parameters - --------- - thresholds: iterable - Iterable of threshold values. Values are used for classifer evaluation. - name: Optional[str] - Name of the evaluation result (Optional as it is mainly used for pickling) - timestamp: Optional[str] - Timestamp when result was created (Optional) - """ - super().__init__(name, timestamp) - self._thresholds = thresholds - self._num_thresholds = len(thresholds) - - self._tp = np.zeros(shape=(self._num_thresholds,)) - self._fp = np.zeros(shape=(self._num_thresholds,)) - self._tn = np.zeros(shape=(self._num_thresholds,)) - self._fn = np.zeros(shape=(self._num_thresholds,)) - - @property - def name(self): - if self._name is None: - self._build_name_from_params() - - return self._name - - @property - def tp(self): - return self._tp - - @property - def fp(self): - return self._fp - - @property - def tn(self): - return self._tn - - @property - def fn(self): - return self._fn - - @property - def thresholds(self): - return self._thresholds - - def file_name(self): - file_name = ( - self.name - if self.name.startswith("th_eval_rslt") - else f"th_eval_rslt_{self.name}" - ) - - return f"{file_name}_{self._timestamp}" - - def create_info_dict(self): - optimal_index = self.optimal_threshold_index() - precision = self._calculate_precision() - recall = self._calculate_recall() - f1_score = self._calculate_f_score() - mcc = self._calculate_mcc() - - info = { - "name": self.name, - "timestamp": self._timestamp, - "thresholds": { - "num_thresholds": self._num_thresholds, - "min_threshold_value": np.amin(self._thresholds), - "max_threshold_value": np.amax(self._thresholds), - }, - "max": { - "f1_score": np.amax(f1_score), - "precision": np.amax(precision), - "recall": np.amax(recall), - "mcc": np.amax(mcc), - }, - "optimum": { - "threshold": self._thresholds[optimal_index], - "f1_score": f1_score[optimal_index], - "precision": precision[optimal_index], - "recall": recall[optimal_index], - "mcc": mcc[optimal_index], - }, - } - - return info - - def calculate_precision(self): - precision = np.zeros(shape=(self._num_thresholds,)) - - for i in range(0, self._num_thresholds): - if self._tp[i] + 
self._fp[i] == 0: - precision[i] = 1.0 - else: - precision[i] = self._tp[i] / (self._tp[i] + self._fp[i]) - - return precision - - def calculate_recall(self): - recall = np.zeros(shape=(self._num_thresholds,)) - - for i in range(0, self._num_thresholds): - if self._tp[i] + self._fn[i] == 0: - recall[i] = 1.0 - else: - recall[i] = self._tp[i] / (self._tp[i] + self._fn[i]) - - return recall - - def calculate_f_score(self): - precision = self.calculate_precision() - recall = self.calculate_recall() - f1_score = np.zeros(shape=(self._num_thresholds,)) - beta_squared = pow(1.0, 2) - - for i in range(0, self._num_thresholds): - f1_score[i] = (1 + beta_squared) * ( - (precision[i] * recall[i]) / (beta_squared * precision[i] + recall[i]) - ) - - return f1_score - - def _calculate_mcc(self): - mcc = np.zeros(shape=(self._num_thresholds,)) - - for i in range(0, self._num_thresholds): - mcc[i] = ( - self._tp[i] * self._tn[i] - self._fp[i] * self._fn[i] - ) / math.sqrt( - ( - (self._tp[i] + self._fp[i]) - * (self._tp[i] + self._fn[i]) - * (self._tn[i] + self._fp[i]) - * (self._tn[i] + self._fn[i]) - ) - ) - - return mcc - - def _calculate_no_skill(self): - no_skill = (self._tp[0] + self._fn[0]) / ( - self._tp[0] + self._fp[0] + self._tn[0] + self._fn[0] - ) - - return no_skill - - def evaluate(self, labels, predict): - for idx, threshold in enumerate(self._thresholds): - adapted_predict = self._predict(threshold, predict) - self._update_data(idx, adapted_predict, labels) - - def _predict(self, threshold, predict): - return (predict[:] >= threshold).astype("int") - - def _update_data(self, threshold_idx, predict, labels): - for i in range(0, len(predict)): - if self._is_true_positive(labels[i], predict[i]): - self._tp[threshold_idx] += 1 - elif self._is_true_negative(labels[i], predict[i]): - self._tn[threshold_idx] += 1 - elif self._is_false_positive(labels[i], predict[i]): - self._fp[threshold_idx] += 1 - elif self._is_false_negative(labels[i], predict[i]): - self._fn[threshold_idx] += 1 - - def _build_name_from_params(self): - self._name = f"th_eval_rslt_{self._num_thresholds}" - - def _is_false_negative(self, label, predict): - return label == 1 and predict == 0 - - def _is_true_positive(self, label, predict): - return label == 1 and predict == 1 - - def _is_false_positive(self, label, predict): - return label == 0 and predict == 1 - - def _is_true_negative(self, label, predict): - return label == 0 and predict == 0 - - -class NegativeStateEvaluationResult(RawBinaryEvaluationResult): - def __init__(self, origin_labels, thresholds, name=None, timestamp=None): - super().__init__(thresholds, name, timestamp) - self._init_negative_sample_state(origin_labels) - self._num_positive_samples = 0 - - def _init_negative_sample_state(self, origin_labels): - negative_samples_end = self._get_negative_samples_end(origin_labels) - self._negative_samples_state = np.zeros( - shape=( - self._num_thresholds, - negative_samples_end, - ), - dtype="int64", - ) - - @property - def negative_sample_state(self): - return self._negative_samples_state - - def file_name(self): - if self.name.startswith("state_eval_rslt"): - file_name = self.name - else: - file_name = f"state_eval_rslt_{self.name}" - - return f"{file_name}_{self._timestamp}" - - def _calculate_no_skill(self): - try: - num_total_samples = ( - len(self._negative_samples_state[0]) + self._num_positive_samples - ) - except IndexError: - num_total_samples = self._num_positive_samples - - self._no_skill = self._num_positive_samples / num_total_samples - - def 
evaluate(self, predict, labels): - self._update_num_positive_samples(labels) - negative_samples_end = self._get_negative_samples_end(labels) - - for idx, threshold in enumerate(self._thresholds): - adapted_predict = self._predict(threshold, predict) - overall_predict = self._get_negative_samples_status( - adapted_predict, negative_samples_end, idx - ) - self._update_data(idx, overall_predict, labels) - self._update_negative_samples_state( - overall_predict, labels, negative_samples_end, idx - ) - - def _get_negative_samples_end(self, labels): - return len([label for label in labels if label == 0]) - - def _update_num_positive_samples(self, labels): - self._num_positive_samples = +len(labels[labels == 1]) - - def _get_negative_samples_status(self, predict, negative_samples_end, idx): - predict[:negative_samples_end] = ( - self._negative_samples_state[idx] | predict[:negative_samples_end] - ).astype(int) - - return predict - - def _update_negative_samples_state( - self, predict, labels, negative_samples_end, idx - ): - self._negative_samples_state[idx] = ( - self._negative_samples_state[idx] - | ( - (labels[:negative_samples_end] == 0) - & (predict[:negative_samples_end] == 1) - ) - ).astype(int) - - def _build_name_from_params(self): - self._name = f"multi_eval_rslt_{self._metric}_{self._num_thresholds}" - - class RuleAttributionEvaluationResult: + """This class evaluates the""" + def __init__(self, num_rules=130, name=None, timestamp=None): self._name = name self._timestamp = ( @@ -486,6 +266,7 @@ def __init__(self, num_rules=130, name=None, timestamp=None): @property def name(self): + """Return the name of the result.""" if self._name is None: self._name = self._build_name_from_params() @@ -493,37 +274,46 @@ def name(self): @property def tp(self): + """Returns the number of true positives (tp).""" return self._tp @property def fp(self): + """Returns the number of false positives (tp).""" return self._fp @property def tn(self): + """Returns the number of true negatives (tn).""" return self._tn @property def fn(self): + """Returns the number of false negatives (fn).""" return self._fn @property def num_total_samples(self): + """Returns the total number of samples seen.""" return self._num_total_samples @property def top_n_hits(self): + """Returns the top-n hits.""" return self._top_n_hits @property def top_n_hit_rates(self): + """Returns the top-n hit rates.""" return self._top_n_hit_rates @property def misses(self): + """Returns the number of misses.""" return self._misses def file_name(self): + """Creates a file name for the result.""" file_name = ( self.name if self.name.startswith("eval_rl_attr") @@ -536,6 +326,7 @@ def file_name(self): return file_name def create_info_dict(self): + """Crates info dict containing meta information.""" info = { "name": self._name, "timestamp": self._timestamp, @@ -561,6 +352,7 @@ def create_info_dict(self): return info def calculate_precision(self): + """Calculates and returns the precision array.""" try: precision = self._tp / (self._tp + self._fp) except ZeroDivisionError: @@ -569,6 +361,7 @@ def calculate_precision(self): return precision def calculate_recall(self): + """Calculates and returns the recall array.""" try: recall = self._tp / (self._tp + self._fn) except ZeroDivisionError: @@ -577,9 +370,11 @@ def calculate_recall(self): return recall def calculate_no_skill(self): + """Calculates and returns the no-skill values.""" return (self._tp + self._fn) / (self._tp + self._fp + self._tn + self._fn) def calculate_f_score(self, beta=1): + 
"""Calculates and returns the f1-score values.""" precision = self.calculate_precision() recall = self.calculate_recall() beta_squared = pow(beta, 2) @@ -591,12 +386,16 @@ def calculate_f_score(self, beta=1): return f_score def calculate_top_n_hit_rates(self): + """Calculate the top-n-hit-rates.""" self._top_n_hit_rates = self._top_n_hits / sum(self._top_n_hits) def calculate_miss_rate(self): + """Calculate the miss rate.""" return self._calculate_rate(self._misses, self._tp) def evaluate_rule_attributions(self, rule_name, rule_attributions): + """Evaluate the given rule attribution by checking at which position + the correct rule name appears in the ranked list of potentially evaded rules.""" self._num_total_samples += 1 if self._is_true_positive(rule_name, rule_attributions): @@ -657,10 +456,3 @@ def _build_name_from_params(self): return f"eval_rl_attr_{self._timestamp}" return "eval_rl_attr" - - -def calculate_hyperplane_distances(svc, data): - if isinstance(data, sparse.csr_matrix): - return svc.decision_function(data) / sparse.linalg.norm(svc.coef_.copy()) - else: - return svc.decision_function(data) / np.linalg.norm(svc.coef_.copy()) diff --git a/amides/amides/events.py b/amides/amides/events.py index 856583f..41c088f 100644 --- a/amides/amides/events.py +++ b/amides/amides/events.py @@ -1,3 +1,4 @@ +"""This module contains classes and functions that help to load and organize events.""" import random from enum import Enum, auto @@ -94,7 +95,7 @@ def size(self): @staticmethod def get_event_type_from_dir_name(dir_name): - return Events.dir_name_event_type_map[dir_name] + return Events.event_name_type_map[dir_name] def add_event(self, event): """ @@ -254,20 +255,12 @@ def _load_events_from_jsonl_file(self, events_file): _logger.debug("Loading events from %s", events_file) events = read_jsonl_file(events_file) for event in events: - try: - self._add_event(event) - except EventsError as err: - _logger.error(err) - continue + self._add_event(event) def _add_event(self, event): if self._is_required_type_of_event(event): self._data.append(event) _logger.debug("Adding event %s to %s events", event, self._type) - else: - raise EventsError( - f"Event type does not match required event type {self._type}" - ) def _is_json_file(self, event_file): return event_file.endswith(".json") @@ -276,7 +269,7 @@ def _is_jsonl_file(self, event_file): return event_file.endswith(".jsonl") def _is_required_type_of_event(self, event): - return True + return True if event is not None else False def _create_random_split(self, split_sizes, seed=None): if seed is not None: @@ -318,6 +311,7 @@ def __init__(self): @property def events(self): + """Returns the dictionary containing events.""" return self._events def add_events(self, events): @@ -378,5 +372,5 @@ def get_events_by_type(self, event_type): return events_list -# Global events cache to hold once loaded benign event data benign_events_cache = EventsCache() +"""Global events cache to hold once loaded benign event data""" diff --git a/amides/amides/features/deduplicate.py b/amides/amides/features/deduplicate.py index 82077dd..e622d06 100644 --- a/amides/amides/features/deduplicate.py +++ b/amides/amides/features/deduplicate.py @@ -1,27 +1,13 @@ +""" This module contains functions and classes that help to deduplicate samples. 
+""" import json from amides.features.normalize import Normalizer -def deduplicate_samples(samples_path: str, normalizer: Normalizer) -> "Cache": - cache = Cache() - - for sample in open(samples_path, "r"): - try: - loaded = json.loads(sample.rstrip("\n")) - except json.JSONDecodeError: - continue - - normalized = normalizer.normalize(loaded) - if not normalized: - continue - - cache.insert(normalized) - - return cache - - class Cache: - """Simple cache to hold already seen samples and count their number of occurrences.""" + """Simple cache based on a dict to hold already seen strings and + count their total number of occurrences. + """ __slots__ = ("_elements",) @@ -30,14 +16,69 @@ def __init__(self): @property def elements(self) -> dict: + """Returns elements held by the cache.""" return self._elements @property def samples(self) -> list[str]: + """Returns the unique samples.""" return list(self._elements.keys()) def insert(self, element: str): + """Insert element into the cache. Increases count + if value has been seen before. + + Parameters + ---------- + element :str + String value that should be insterted + """ self._elements[element] = self._elements.get(element, 0) + 1 def get(self, element: str) -> int: + """Return specific element from the cache in case this element + has been seen before. + + Parameters + ---------- + element: str + String value + + Returns + ------- + : str + """ return self._elements.get(element, "") + + +def deduplicate_samples(samples_path: str, normalizer: Normalizer) -> Cache: + """Deduplicates list of samples in the specified file using Cache. Samples + are normalized first using the provided Normalizer-instance. + + Parameters + ---------- + samples_path :str + Path of the file containing samples + normalizer :Normalizer + Normalizer instance to normalize samples prior to deduplication + + Returns + ------- + cache :Cache + Cache object holding the deduplicated samples + """ + cache = Cache() + + for sample in open(samples_path, "r", encoding="utf-8"): + try: + loaded = json.loads(sample.rstrip("\n")) + except json.JSONDecodeError: + continue + + normalized = normalizer.normalize(loaded) + if not normalized: + continue + + cache.insert(normalized) + + return cache diff --git a/amides/amides/features/extraction.py b/amides/amides/features/extraction.py index 4b99a4c..e8c7274 100644 --- a/amides/amides/features/extraction.py +++ b/amides/amides/features/extraction.py @@ -1,4 +1,5 @@ -"""This module contains functions and classes that are used for feature extraction.""" +"""This module contains functions and classes that are used for feature extraction from +data bunches.""" import numpy as np @@ -8,10 +9,7 @@ from sklearn.preprocessing import FunctionTransformer from amides.data import DataBunch -from amides.utils import get_current_timestamp, get_logger -from amides.features.tokenization import AnyWordCharacter, CommaSeparation -from amides.features.filter import NumericValues, Strings -from amides.features.preprocessing import FilterDummyCharacters, Lowercase +from amides.utils import get_logger _logger = get_logger(__name__) @@ -22,10 +20,12 @@ class TextFeatureExtractor(ABC): @property @abstractmethod def name(self): + """Returns the name of the extractor.""" return @abstractmethod def file_name(self): + """Returns the file name of the extractor.""" return def extract(self, train_data, test_data=None, valid_data=None): @@ -149,6 +149,7 @@ def name(self): @property def vectorizer(self): + """Return the underlying vectorizer.""" return self._vectorizer 
def file_name(self):
@@ -193,12 +194,14 @@ def transform(self, samples):
return transformed_samples
def get_feature_names(self):
+ """Return feature names (vocabulary) learned by the extractor."""
return self._vectorizer.get_feature_names_out()
class TfidfExtractor(TextFeatureExtractor):
- """Convert text data into n-dimensional darray of
- Term Frequency-Inverse Document Frequency (TF-IDF) vectors."""
+ """Convert data bunches holding text data into n-dimensional ndarray of
+ Term Frequency-Inverse Document Frequency (TF-IDF) vectors.
+ """
def __init__(
self,
@@ -255,6 +258,7 @@ def name(self):
@property
def vectorizer(self):
+ """Return the underlying vectorizer."""
return self._vectorizer
def file_name(self):
@@ -298,6 +302,7 @@ def transform(self, samples):
return transformed_samples
def get_feature_names(self):
+ """Return feature names (vocabulary) learned by the extractor."""
return self._vectorizer.get_feature_names_out()
@@ -664,6 +669,17 @@ def extract_process_args(events):
@staticmethod
def extract_process_args_from_event(event):
+ """Extract the 'process.args' field value from the given event dictionary.
+
+ Parameters
+ ----------
+ event :dict
+ Event whose values should be extracted
+
+ Returns
+ -------
+ args: Optional[str]
+ """
try:
return event["process"]["args"]
except KeyError:
@@ -789,6 +805,17 @@ def extract_commandline(events):
@staticmethod
def extract_commandline_from_event(event):
+ """Extract process command-line from winlog or SOCBED event.
+
+ Parameters
+ ----------
+ event :dict
+ Event whose values should be extracted
+
+ Returns
+ -------
+ args: Optional[str]
+ """
proc_cmdline = CommandlineExtractor._extract_commandline_from_winlog(event)
if proc_cmdline is None:
proc_cmdline = CommandlineExtractor._extract_commandline(event)
@@ -810,7 +837,7 @@ def _extract_commandline(event):
return None
def _adjust_data_labels_mismatch(self, data_bunch, transformed_data):
- none_indices = np.where(transformed_data == None)[0]
+ none_indices = np.where(np.equal(transformed_data, None))[0]
adjusted_data = np.delete(transformed_data, none_indices, axis=0)
adjusted_labels = np.delete(data_bunch.labels, none_indices, axis=0)
@@ -824,7 +851,6 @@ def _adjust_data_labels_mismatch(self, data_bunch, transformed_data):
return adjusted_bunch
-
def _is_valid_str_sequence_array(seq_iter):
"""Checks if list or np.ndarray of strings is provided.
@@ -836,8 +862,6 @@ def _is_valid_str_sequence_array(seq_iter):
Returns
-------
result: Boolean
- True/False.
-
"""
if isinstance(seq_iter, (list, np.ndarray)):
return all(isinstance(sequence, str) for sequence in seq_iter)
diff --git a/amides/amides/features/filter.py b/amides/amides/features/filter.py
index f5a133e..06c7281 100644
--- a/amides/amides/features/filter.py
+++ b/amides/amides/features/filter.py
@@ -1,8 +1,14 @@
+"""This module contains token elimination classes used to eliminate tokens showing specific
+patterns from a list of tokens.
+"""
+
import re
from abc import ABC, abstractmethod
class TokenEliminator(ABC):
+ """Base class for all token elimination classes."""
+
@abstractmethod
def __call__(self, token_list):
pass
@@ -10,11 +16,21 @@ def __call__(self, token_list):
@property
@abstractmethod
def name(self):
- pass
+ """Return the name of the eliminator."""
class NumericValues(TokenEliminator):
+ """NumericValues eliminates hex and decimal values whose number of characters/digits exceeds
+ a maximum length value."""
+
def __init__(self, length):
+ """Create instances.
+ + Parameter + -------- + length: int + Maximum character length of hex/decimal values. + """ super().__init__() self._re = r"^(?:0x)?[0-9a-f]{{{0},}}$".format(length + 1) @@ -29,6 +45,8 @@ def name(self): class Strings(TokenEliminator): + """Eliminates strings that exceed a certain length.""" + def __init__(self, length): super().__init__() self._length = length diff --git a/amides/amides/features/normalize.py b/amides/amides/features/normalize.py index d2ae68d..b17c782 100644 --- a/amides/amides/features/normalize.py +++ b/amides/amides/features/normalize.py @@ -1,3 +1,5 @@ +"""This module normalizes samples by performing certain preprocessing, tokenization, and toke elimination steps.""" + from typing import List from amides.features.preprocessing import FilterDummyCharacters, Lowercase @@ -44,6 +46,19 @@ def __init__(self, max_len_num_values=3, max_len_strings=30): self._strings = Strings(length=max_len_strings) def normalize(self, sample: str) -> str: + """Normalize a single string. + + Parameters + --------- + sample: str + Sample which should be normalized. + + Returns + ------- + normalized :str + String with comma-separated list of remaining tokens. + + """ preprocessed = self._preprocess(sample) tokens = self._tokenize(preprocessed) shrinked_tokens = self._eliminate_tokens(tokens) diff --git a/amides/amides/features/preprocessing.py b/amides/amides/features/preprocessing.py index 29bc42b..48bdd19 100644 --- a/amides/amides/features/preprocessing.py +++ b/amides/amides/features/preprocessing.py @@ -1,8 +1,11 @@ +"""This module contains classes used for preprocessing during normalization.""" import re from abc import ABC, abstractmethod class Preprocessor(ABC): + """Base class for all Preprocessing-classes.""" + @abstractmethod def __call__(self, string): pass @@ -10,10 +13,12 @@ def __call__(self, string): @property @abstractmethod def name(self): - pass + """Return name of the preprocessor.""" class FilterDummyCharacters(Preprocessor): + """FilterDummyCharacter removes all command-line dummy characters (",^,`).""" + def __init__(self): super().__init__() self._re = r"[\"\^`’]" @@ -27,6 +32,8 @@ def name(self): class Lowercase(Preprocessor): + """Turns all samples into lowercase.""" + def __call__(self, string): return string.lower() diff --git a/amides/amides/features/tokenization.py b/amides/amides/features/tokenization.py index 9a0a235..3e45802 100644 --- a/amides/amides/features/tokenization.py +++ b/amides/amides/features/tokenization.py @@ -1,8 +1,11 @@ +"""This module contains classes used to turn samples into lists of tokens.""" import re from abc import ABC, abstractmethod class Tokenizer(ABC): + """Base class for all tokenization-classes.""" + @abstractmethod def __call__(self, string): pass @@ -10,7 +13,7 @@ def __call__(self, string): @property @abstractmethod def name(self): - pass + """Returns unique name tag of the tokenizer-class.""" class Split(Tokenizer): @@ -42,7 +45,7 @@ def name(self): class WhitespaceAsteriskSlashMinus(Tokenizer): - """WhitespaceAsteriskSlashMinus-Tokenizer to be used as tokenizer. Splits strings + """WhitespaceAsteriskSlashMinus-Tokenizer splits strings on whitespace, asterisk(*), slashes (/\), and minus (-) symbols. """ @@ -59,6 +62,10 @@ def name(self): class WhitespaceAsteriskSlashMinusEquals(Tokenizer): + """WhitespaceAsteriskSlashMinusEquals-Tokenizer splits strings + on whitespace, asterisk(*), slashes (/\), minus (-), and equals (=) symbols. 
+ """ + def __init__(self): super().__init__() self._re = r"([^\s\\/\*=-]+)" @@ -72,6 +79,9 @@ def name(self): class AnyWordCharacter(Tokenizer): + """Split string samples on the occurrence of any-word character, + i.e.[a-zA-Z_0-9].""" + def __init__(self): super().__init__() self._re = r"(\w+)" @@ -85,6 +95,8 @@ def name(self): class CommaSeparation(Tokenizer): + """Split samples on any comma value.""" + def __call__(self, string): return string.split(",") @@ -94,6 +106,7 @@ def name(self): class TokenizerFactory: + """TokenizerFactory to create Tokenizer-Objects using their unique name tags.""" _tokenizers = { "split": Split, @@ -106,4 +119,15 @@ class TokenizerFactory: @classmethod def create(cls, name): + """Create a Tokenizer-Instance using its unique name tag. + + Parameters + ---------- + name: str + The unique name of the Tokenizer-class. + + Returns + ------ + : Tokenizer + """ return cls._tokenizers[name]() diff --git a/amides/amides/models/selection.py b/amides/amides/models/selection.py index fd3f048..91ee158 100644 --- a/amides/amides/models/selection.py +++ b/amides/amides/models/selection.py @@ -1,4 +1,4 @@ -"""This module contains functions and classes that aim to help with model selection.""" +"""This module contains functions and classes that aim to help with candidate selection.""" from sklearn.model_selection import GridSearchCV, ParameterGrid from sklearn.utils.validation import check_is_fitted @@ -28,7 +28,7 @@ def __init__( Parameters ---------- estimator: sklearn.base.BaseEstimator - The estimator which should be optimised. + The estimator which should be optimized. param_grid: dict The parameter grid used as search space. search_method: sklearn.model_selection.BaseSearchCV @@ -51,16 +51,20 @@ def __init__( @property def best_parameters(self): + """Returns the best (final) model parameters.""" check_is_fitted(self._search_method) return self._search_method.best_params_ @property def best_estimator(self): + """Returns the estimator already fitted on the given data and + the best parameters.""" check_is_fitted(self._search_method) return self._search_method.best_estimator_ @property def best_score(self): + """Returns the best score value of all tested candidates.""" check_is_fitted(self._search_method) return self._search_method.best_score_ @@ -122,7 +126,7 @@ def predict(self, data): _logger.debug( "Calculating prediction on: estimator=%s best_parameters=%s", self._search_method.best_estimator_.__class__.__name__, - self._search_method.best_parameters_, + self._search_method.best_params_, ) return self._search_method.best_estimator_.predict(data.samples) @@ -150,8 +154,7 @@ def search_and_predict(self, train_test_split): """ if not isinstance(train_test_split, TrainTestSplit): raise ValueError( - "data is of type {0}, but not of required" - "type TrainTestSplit".format(type(train_test_split)) + f"data is of type {type(train_test_split)}, but not of required type TrainTestSplit" ) self.search_best_parameters(train_test_split.train_data) @@ -187,17 +190,32 @@ def __init__( @property def estimator(self): + """Returns the estimator.""" return self._estimator @property def cv(self): + """Returns the CV instance.""" return self._cv @property def param_grid(self): + """Return the parameter space used for candidate search.""" return self._param_grid def fit(self, samples, labels): + """Search the parameter grid for the best candiate using + the given samples. 
+ + Parameters + ---------- + samples :np.ndarray + Array of samples + + labels :np.ndarray + Array of labels + + """ for params in self._param_grid: self._estimator.set_params(**params) self._estimator.fit(samples, labels) diff --git a/amides/amides/persist.py b/amides/amides/persist.py index aba29ae..4149752 100644 --- a/amides/amides/persist.py +++ b/amides/amides/persist.py @@ -1,14 +1,9 @@ -"""This module contains the functionality to persist and load intermediate -and final classification results. - -This includes training and test data splits, -training results (optmimization results), and final validation results. - +"""This module contains the functionality to save and load training and +validation results. """ - +import os import json import ndjson -import os from zipfile import ZipFile, ZIP_DEFLATED from joblib import dump, load @@ -20,7 +15,6 @@ from amides.evaluation import ( BinaryEvaluationResult, RuleAttributionEvaluationResult, - NegativeStateEvaluationResult, ) from amides.utils import get_logger @@ -62,6 +56,17 @@ def _create_output_directory(self, output_path): self._output_path = output_path def save_object(self, obj, file_name=None): + """Save given object. + + Parameters + ---------- + obj: object + Object which should be pickled. + + file_name: Optional[str] + Name of the output file. + + """ if self._is_known_object(obj): self._save_known_object(obj, file_name) else: @@ -176,7 +181,6 @@ def _is_known_object(self, obj): MultiTrainingResult, BinaryEvaluationResult, RuleAttributionEvaluationResult, - NegativeStateEvaluationResult, dict, ), ): @@ -217,19 +221,40 @@ def __init__(self, out_dir: str, start: str, end: str): @staticmethod def create_filename(start_iso: str, end_iso: str): + """Create file name for the events that have been written to file.""" start = start_iso.replace(":", "").replace("-", "") end = end_iso.replace(":", "").replace("-", "") return f"events_{start}_{end}" - def write(self, hits: list[dict], batch_size: int): + def write(self, hits: list[dict]): + """Write events to file in batches. If file exists, events are appended. + + Parameters + ---------- + hist: List[Dict] + List of events that should be written to file. + """ with self._output_path.open("a+", encoding="utf-8") as out_file: self._write_batch(hits, out_file) def get_last_file(self) -> Optional[Path]: + """Returns the path of the last output file. + + Returns + ------- + :Optional[Path] + """ return self._output_path if self._output_path.is_file() else None def read_last_file(self) -> set[str]: + """Read events already written to the last file. + + Returns + ------- + :set[str] + Set of unique events written to the last file. + """ last_file = self.get_last_file() if last_file: with last_file.open("r", encoding="utf-8") as last_file: @@ -255,7 +280,14 @@ def __init__( self._compression = compression self._archive_path = self._output_path.parent / f"{self._output_path.name}.zip" - def write(self, hits: list[dict], batch_size: int): + def write(self, hits: list[dict]): + """Write list of events to .zip-file. + + Parameters + ---------- + hits: list[dict] + List of dictionaries. 
+ """ if self._archive_path.is_file(): events = self._get_events() events.extend(hits) diff --git a/amides/amides/scale.py b/amides/amides/scale.py index bb3a0b4..a6cd986 100644 --- a/amides/amides/scale.py +++ b/amides/amides/scale.py @@ -1,9 +1,24 @@ +"""This module contains functions to create symmetric output value scalers.""" import numpy as np from sklearn.preprocessing import MinMaxScaler from sklearn.metrics import matthews_corrcoef def create_min_max_scaler(df_min: float, df_max: float): + """Create symmetric min-max scaler. + + Parameters + ---------- + df_min: float + The minimum df-value + + df_max: float + The maximum df_value + + Returns + ------- + : MinMaxScaler + """ scaler = MinMaxScaler() scaler.data_min_ = df_min scaler.data_max_ = df_max @@ -18,20 +33,52 @@ def create_min_max_scaler(df_min: float, df_max: float): return scaler -def calculate_shifting_value(mcc: np.ndarray, df_iter_values: np.ndarray): - mcc_optimum_idx = np.argmax(mcc) - optimum_df_value = df_iter_values[mcc_optimum_idx] +def create_symmetric_mcc_min_max_scaler( + df_values: np.ndarray, + labels: np.ndarray, + num_mcc_samples: int, + mcc_threshold: float, +): + """Creates a symmetric min-max scaler using mcc threshold optimization.""" + df_min, df_max = df_values.min(), df_values.max() + iter_values = _calculate_iter_values(df_min, df_max, num_mcc_samples) + mcc = _calculate_mcc_values(df_values, labels, iter_values) + + target_df_values = _calculate_target_df_values(mcc, iter_values, mcc_threshold) + target_df_min, target_df_max = _calculate_symmetric_min_max_df_values( + target_df_values + ) + + # Repeat process of MCC optimization after target df-value range was calculated + # in order to increase precision + target_iter_values = _calculate_iter_values( + target_df_min, target_df_max, num_mcc_samples + ) + target_mcc = _calculate_mcc_values(df_values, labels, target_iter_values) + target_df_values = _calculate_target_df_values( + target_mcc, target_iter_values, mcc_threshold + ) + target_df_min, target_df_max = _calculate_symmetric_min_max_df_values( + target_df_values + ) + + return create_min_max_scaler(target_df_min, target_df_max) + - return optimum_df_value +def create_symmetric_min_max_scaler(df_values: np.ndarray): + """Create symmetric min-max scaler.""" + df_min, df_max = _calculate_symmetric_min_max_df_values(df_values) + + return create_min_max_scaler(df_min, df_max) -def calculate_iter_values(min_value: float, max_value: float, num_iter_values: int): +def _calculate_iter_values(min_value: float, max_value: float, num_iter_values: int): iter_step = (max_value - min_value) / num_iter_values return np.arange(min_value, max_value + iter_step, iter_step) -def calculate_target_df_values( +def _calculate_target_df_values( mcc: np.ndarray, df_iter_values: np.ndarray, mcc_threshold: float ): target_idcs = np.where(mcc > mcc_threshold)[0] @@ -39,7 +86,7 @@ def calculate_target_df_values( return df_iter_values[target_idcs] -def calculate_mcc_values(df_values: np.ndarray, labels: np.ndarray, iter_values: int): +def _calculate_mcc_values(df_values: np.ndarray, labels: np.ndarray, iter_values: int): mcc = np.zeros(shape=(iter_values.size,)) for i, threshold in enumerate(iter_values): predict = np.where(df_values >= threshold, 1, 0) @@ -48,7 +95,7 @@ def calculate_mcc_values(df_values: np.ndarray, labels: np.ndarray, iter_values: return mcc -def calculate_symmetric_min_max_df_values(df_values: np.ndarray): +def _calculate_symmetric_min_max_df_values(df_values: np.ndarray): df_min = df_values.min() 
df_max = df_values.max() @@ -58,40 +105,3 @@ def calculate_symmetric_min_max_df_values(df_values: np.ndarray): df_min = df_max * -1.0 return df_min, df_max - - -def create_symmetric_mcc_min_max_scaler( - df_values: np.ndarray, - labels: np.ndarray, - num_mcc_samples: int, - mcc_threshold: float, -): - df_min, df_max = df_values.min(), df_values.max() - iter_values = calculate_iter_values(df_min, df_max, num_mcc_samples) - mcc = calculate_mcc_values(df_values, labels, iter_values) - - target_df_values = calculate_target_df_values(mcc, iter_values, mcc_threshold) - target_df_min, target_df_max = calculate_symmetric_min_max_df_values( - target_df_values - ) - - # Repeat process of MCC optimization after target df-value range was calculated - # in order to increase precision - target_iter_values = calculate_iter_values( - target_df_min, target_df_max, num_mcc_samples - ) - target_mcc = calculate_mcc_values(df_values, labels, target_iter_values) - target_df_values = calculate_target_df_values( - target_mcc, target_iter_values, mcc_threshold - ) - target_df_min, target_df_max = calculate_symmetric_min_max_df_values( - target_df_values - ) - - return create_min_max_scaler(target_df_min, target_df_max) - - -def create_symmetric_min_max_scaler(df_values: np.ndarray): - df_min, df_max = calculate_symmetric_min_max_df_values(df_values) - - return create_min_max_scaler(df_min, df_max) diff --git a/amides/amides/sigma.py b/amides/amides/sigma.py index e724f8e..d2d8524 100644 --- a/amides/amides/sigma.py +++ b/amides/amides/sigma.py @@ -7,7 +7,7 @@ from luqum.tree import NoneItem from amides.data import DataBunch, TrainTestValidSplit, TrainTestSplit -from amides.events import Events, EventsError +from amides.events import Events from amides.utils import ( read_json_file, read_yaml_file, @@ -348,7 +348,7 @@ def _load_rule_filter(self, rule_path): rules = read_yaml_file(rule_path) self._filter = self._extract_rule_filters(rules) self._name = self._extract_rule_name(rules) - except (TypeError, IndexError) as err: + except (TypeError, IndexError, FileNotFoundError) as err: raise RuleDatasetError(self._name, "No rule filter available") from err def _extract_rule_filters(self, rules): @@ -376,7 +376,7 @@ def _load_properties(self, events_dir_path): try: properties = read_yaml_file(properties_path) return properties[0] - except IndexError as err: + except (IndexError, TypeError, FileNotFoundError) as err: raise RuleDatasetError(self._name, "No properties.yml available") from err def _is_evasion_possible(self, properties): @@ -490,8 +490,8 @@ class RuleSetDataset: dir_name_rule_type_map = { "process_creation": RuleType.WINDOWS_PROCESS_CREATION, "registry_event": RuleType.WINDOWS_REGISTRY_EVENT, - "web": RuleType.WEB_PROXY, - "proxy": RuleType.WEB_PROXY, + "powershell": RuleType.WINDOWS_POWERSHELL, + "proxyweb": RuleType.WEB_PROXY, } def __init__(self, name=None, set_type=None): @@ -878,8 +878,8 @@ def _load_and_add_rule_data( rule_dir_name, rule_set_events_path, rule_set_rules_path ) self._add_rule_dataset(rule_data) - except (EventsError, RuleDatasetError) as err: - _logger.error(err) + except RuleDatasetError as err: + _logger.info(err) def _load_rule_data(self, rule_dir_name, rule_set_events_path, rule_set_rules_path): rule_data_events_path = os.path.join(rule_set_events_path, rule_dir_name) diff --git a/amides/amides/utils.py b/amides/amides/utils.py index d012560..c10a53f 100644 --- a/amides/amides/utils.py +++ b/amides/amides/utils.py @@ -1,5 +1,4 @@ -""" This module contains functions for general purpose 
use, e.g. opening files or - fetching directory contents. +""" This module contains functions for general purpose use, e.g. opening files or fetching directory contents. """ import os import re @@ -23,11 +22,24 @@ class TimeRangeIterator: """ def __init__(self, start: str, end: str, interval: str): + """Create TimeRangeIterator. + + Parameters + ---------- + start : str + Starting timestamp in ISO8601 format + end: str + Ending timestamp in ISO8601 format + interval: str + Interval in 'HH:MM:SS.s+' + """ self._start = self._parse_timestamp(start) self._end = self._parse_timestamp(end) + self._interval = self._parse_interval(interval) def next(self): + """Returns the next timestamp value.""" current_start = self._start current_end = current_start + self._interval yield current_start.isoformat(), current_end.isoformat() @@ -54,6 +66,17 @@ def load_args_from_file( ) -> argparse.Namespace: """Loads command line arguments from config file and puts values in args.Namsespace object. + + Parameters + ---------- + parser : argparse.ArgumentParser + The ArgumentParser instance + path: str + Path of the configuration file (.json) + + Returns + ------- + args: Optional[argparse.ArgumentParser] """ config_dict = read_json_file(path) if config_dict: @@ -97,12 +120,6 @@ def get_file_names(path): ------- file_names: List[str] List of file names. - - Raises - ------ - OSError - If path does not lead to existing directory. - """ file_names = [] for entry in sorted(os.listdir(path)): @@ -125,12 +142,6 @@ def get_file_paths(path): ------- paths: List[str] List of file paths. - - Raises - ------ - OSError - In case directory does not exist or cannot be accessed. - """ file_paths = [] for entry in sorted(os.listdir(path)): @@ -154,12 +165,6 @@ def get_dir_names(path): ------- dir_names: List[str] List of directory names. - - Raises - ------ - OSError - In case target directory does not exist or cannot be accessed. - """ dir_names = [] for entry in sorted(os.listdir(path)): @@ -181,17 +186,14 @@ def read_yaml_file(path): Returns ------- - yaml-data: Optional[Dict] - Dict-representation of the loaded .y(a)ml-file or None. + yaml-data: Dict + Dict-representation of the loaded .y(a)ml-file. """ - try: - with open(path, "r") as f: - docs = yaml.safe_load_all(f) - return list(docs) - except FileNotFoundError as err: - _logger.error(err) - return None + + with open(path, "r", encoding="utf-8") as f: + docs = yaml.safe_load_all(f) + return list(docs) def read_json_file(path): @@ -210,7 +212,7 @@ def read_json_file(path): """ try: - with open(path, "r") as f: + with open(path, "r", encoding="utf-8") as f: data = json.load(f) return data @@ -236,7 +238,7 @@ def read_jsonl_file(path): """ try: json_objects = [] - with open(path, "r") as f: + with open(path, "r", encoding="utf-8") as f: for line in f: json_objects.append(json.loads(line)) @@ -248,7 +250,8 @@ def read_jsonl_file(path): def execution_time(func): """ - Wrapper for functions and methods that measures execution time in ms + Wrapper for functions and methods that measures execution time in + milliseconds (ms) Parameters ---------- diff --git a/amides/amides/visualization.py b/amides/amides/visualization.py index 4ee816f..07569af 100644 --- a/amides/amides/visualization.py +++ b/amides/amides/visualization.py @@ -1,6 +1,5 @@ """This module contains components used for the visualization of -classification results. - +evaluation results and other data. 
""" import matplotlib.pyplot as plt import seaborn as sbn @@ -142,7 +141,7 @@ def data(self, data): self._data = data def file_name(self): - return f"combined_dist_{self._name}" + return self._name def plot(self): self._figure, self._ax = plt.subplots(figsize=(6.4, 3.2)) @@ -428,9 +427,9 @@ def timestamp(self, timestamp): def file_name(self): if self._timestamp: - file_name = f"prt_plot_{self._name}_{self._timestamp}" + file_name = f"{self._name}_{self._timestamp}" else: - file_name = f"prt_plot_{self._name}" + file_name = self._name return file_name @@ -572,7 +571,10 @@ def __init__(self, name=None, timestamp=None): super().__init__(timestamp, name) def file_name(self): - return f"multi_pr_plot_{self._name}_{self._timestamp}" + if self._timestamp: + return f"{self._name}_{self._timestamp}" + else: + return self._name def plot(self): self._figure, self._ax = plt.subplots(figsize=(6.4, 3.2)) @@ -692,9 +694,9 @@ def __init__(self, name=None, timestamp=None): def file_name(self): if self._timestamp: - file_name = f"multi_pr_plot_{self._name}_{self._timestamp}" + file_name = f"{self._name}_{self._timestamp}" else: - file_name = f"multi_pr_plot_{self._name}" + file_name = self._name return file_name diff --git a/amides/bin/add_scaler.py b/amides/bin/add_scaler.py index 9543cf4..8b08768 100755 --- a/amides/bin/add_scaler.py +++ b/amides/bin/add_scaler.py @@ -1,4 +1,9 @@ #!/usr/bin/env python3 +"""This script takes a trained (and validated) model and fits a symmetric min-max scaler from +decision function values. The decision function values are computed either using feature vectors +of training data or validation data. The range of the symmetric min-max scaler is either determined +by a specific MCC threshold range or the highest absolute decision function value. +""" import sys import argparse diff --git a/amides/bin/combine_models.py b/amides/bin/combine_models.py index 2a4b67e..3d124e8 100755 --- a/amides/bin/combine_models.py +++ b/amides/bin/combine_models.py @@ -1,4 +1,9 @@ #!/usr/bin/env python3 +"""This script prepares misuse classification models and rule attribution models for the +operational use with the AMIDES Logprep processor. Both models are extracted from the +corresponding (Multi)ValidationResult and pickled, together with feature extractors and +scalers, into a single archive (.zip). +""" import os import sys @@ -121,13 +126,13 @@ def main(): "--single", type=str, action="store", - help="Path to results file (TrainingResult, ValidationResult) containing the single model", + help="Path of the TrainingResult containing the misuse classification model", ) parser.add_argument( "--multi", type=str, action="store", - help="Path to results file (ValidationResults) holding the multi model", + help="Path of the MultiValidationResult with the rule attribution model", ) parser.add_argument( "--out-dir", type=str, action="store", help="Specify output directory" diff --git a/amides/bin/confidence.py b/amides/bin/confidence.py index ac71534..b66646f 100755 --- a/amides/bin/confidence.py +++ b/amides/bin/confidence.py @@ -1,4 +1,13 @@ #!/usr/bin/env python3 +"""This script creates a listing of + (1) Sample + (2) Normalized sample (String of tokens that are left over after normalization) + (3) Features extracted using the feature extractor + (4) Number of entries in the feature vector that are non-zero + (5) Decision function value + (6) Confidence value (scaled decision function value) +using a trained misuse classifcation model and its feature extractor, on a list of samples. 
+""" import sys import os @@ -282,7 +291,7 @@ def parse_args_and_options(): "--benign-samples", action="store", type=str, - help="Path of a benign samples file", + help="Path of file (.txt) containing benign samples", ) parser.add_argument( "--sigma-dir", action="store", type=str, help="Path of the sigma data directory" diff --git a/amides/bin/config/pr_plot_powershell_proxy_registry.json b/amides/bin/config/pr_plot_powershell_proxy_registry.json index 49a3ccf..64761a9 100644 --- a/amides/bin/config/pr_plot_powershell_proxy_registry.json +++ b/amides/bin/config/pr_plot_powershell_proxy_registry.json @@ -7,5 +7,5 @@ "type": "multi_prt", "save": true, "out_dir": "plots", - "title": "misuse_svc_rules_proxy_registry_powershell_f1" + "title": "figure_6_c5_classification_other_types" } diff --git a/amides/bin/config/process_creation/attr_plot.json b/amides/bin/config/process_creation/attr_plot.json index e9bcce2..7316a32 100644 --- a/amides/bin/config/process_creation/attr_plot.json +++ b/amides/bin/config/process_creation/attr_plot.json @@ -2,5 +2,5 @@ "eval_result": "models/process_creation/eval_rl_attr.zip", "plot": "combined", "out_dir": "plots/process_creation", - "title": "attr_svc_rules_f1" + "title": "figure_4_c3_rule_attribution" } diff --git a/amides/bin/config/process_creation/pr_plot_tainted.json b/amides/bin/config/process_creation/pr_plot_tainted.json index be77525..328066f 100644 --- a/amides/bin/config/process_creation/pr_plot_tainted.json +++ b/amides/bin/config/process_creation/pr_plot_tainted.json @@ -37,5 +37,5 @@ "models/process_creation/tainted/eval/eval_rslt_misuse_svc_rules_tainted_30_f1_9.zip" ], "out_dir": "plots/process_creation", - "title": "misuse_rules_svc_tainted_f1" + "title": "figure_5_c4_tainted_training" } diff --git a/amides/bin/config/process_creation/prt_plot_misuse_rules_matches.json b/amides/bin/config/process_creation/prt_plot_misuse_rules_matches.json index d211dff..b2fbf6a 100644 --- a/amides/bin/config/process_creation/prt_plot_misuse_rules_matches.json +++ b/amides/bin/config/process_creation/prt_plot_misuse_rules_matches.json @@ -6,5 +6,5 @@ "type": "prt", "out_dir": "plots/process_creation", "save": true, - "title": "misuse_svc_rules_matches_f1" + "title": "figure_3_c1_c2_misuse_classification" } diff --git a/amides/bin/empty_attr.py b/amides/bin/empty_attr.py deleted file mode 100755 index 8ed3003..0000000 --- a/amides/bin/empty_attr.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 - - -import sys -import argparse -import functools - -from amides.persist import Dumper, PersistError -from amides.utils import get_logger, set_log_level - - -set_log_level("info") -logger = get_logger(__name__) - -dumper = None - - -def init_dumper(out_dir): - global dumper - - try: - if not dumper: - dumper = Dumper(out_dir) - - except OSError as err: - logger.err(err) - - -def load_pickled_object(path): - try: - return dumper.load_object(path) - except (TypeError, PersistError) as err: - logger.error(err) - sys.exit(1) - - -def save_object(obj): - dumper.save_object(obj) - - -def rsetattribute(obj, attribute, value): - pre, _, post = attribute.rpartition(".") - - return setattr(rgetattribute(obj, pre) if pre else obj, post, value) - - -def rgetattribute(obj, attribute, *args): - def _getattr(obj, attribute): - return getattr(obj, attribute, *args) - - return functools.reduce(_getattr, [obj] + attribute.split(".")) - - -def empty_attribute(object_paths, attribute, out_dir): - init_dumper(out_dir) - - for object_path in object_paths: - obj = 
load_pickled_object(object_path) - - try: - rsetattribute(obj, attribute, None) - except AttributeError as err: - logger.error(err) - sys.exit(1) - - save_object(obj) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--object", type=str, action="append", help="Path of the pickled object file" - ) - parser.add_argument( - "--attribute", - type=str, - action="store", - help="Attribute which should be removed", - ) - parser.add_argument( - "--out-dir", type=str, action="store", help="Attribute which should be removed" - ) - - args = parser.parse_args() - - empty_attribute(args.object, args.attribute, args.out_dir) - - -if __name__ == "__main__": - main() diff --git a/amides/bin/eval_attr.py b/amides/bin/eval_attr.py old mode 100755 new mode 100644 diff --git a/amides/bin/eval_mcc_scaling.py b/amides/bin/eval_mcc_scaling.py index f7b6bbb..6d2f235 100755 --- a/amides/bin/eval_mcc_scaling.py +++ b/amides/bin/eval_mcc_scaling.py @@ -1,11 +1,14 @@ #!/usr/bin/env python3 +"""This script takes already validated misuse classification models and builds a symmetric min-max scaler +using the mcc values and a pre-defined mcc value threshold. Afterwards, the model is evaluated using the +newly calibrated scaler. The number of evaluation thresholds (or evaluation iterations) is configurable. +""" import os import sys import argparse import numpy as np import itertools -import time from sklearn.metrics import matthews_corrcoef, precision_score, recall_score from sklearn.preprocessing import MinMaxScaler @@ -27,7 +30,6 @@ zero_null_vectors = False mcc_threshold = 0.1 -multi_scaling = False dumper = None save = False @@ -243,53 +245,6 @@ def prepare_values_and_labels( return df_values, labels -def evaluate_multi_mcc_scaling_with_optimum_shift(): - timestamp = get_current_timestamp() - target_df_min, target_df_max = None, None - - result_paths = list(itertools.zip_longest(valid_results, mcc_values)) - - for i, valid_rslt_path, mcc_val_path in enumerate(result_paths): - df_values, labels, result_name = load_valid_result(valid_rslt_path) - - df_iter_values = calculate_iter_values(df_values.min(), df_values.max()) - mcc = calculate_mcc(df_iter_values, labels) - - if i == 0: - ( - target_df_min, - target_df_max, - ) = get_target_df_min_max_values(mcc, df_iter_values) - target_iter_values = calculate_iter_values(target_df_min, target_df_max) - - target_mcc = load_mcc_values(mcc_val_path) - if not target_mcc: - target_mcc = calculate_mcc(df_values, labels, target_iter_values) - save_mcc_values(target_mcc, target_iter_values, result_name, timestamp) - - shift_value = calculate_shifting_value(target_mcc, target_iter_values) - shifted_df_values = df_values - shift_value - - scaler = create_min_max_scaler(target_df_min, target_df_max) - scaled_df_values = scaler.transform(shifted_df_values[:, np.newaxis]).flatten() - - scaled_iter_values = calculate_iter_values(0, 1) - - precision = calculate_precision(scaled_df_values, labels, scaled_iter_values) - recall = calculate_recall(scaled_df_values, labels, scaled_iter_values) - - # Save result using original result name - eval_result = BinaryEvaluationResult( - thresholds=scaled_iter_values, - name=result_name, - timestamp=timestamp, - ) - eval_result.precision = precision - eval_result.recall = recall - - save_eval_result(eval_result) - - def evaluate_mcc_scaling_with_optimum_shift(): result_paths = list(itertools.zip_longest(valid_results, train_results, mcc_values)) @@ -342,13 +297,6 @@ def evaluate_mcc_scaling_with_optimum_shift(): 
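
Stripped of file handling and result objects, the threshold-sweep evaluation that this script performs boils down to the following self-contained sketch (the function name and defaults are illustrative, not the script's actual helpers):

```python
import numpy as np
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score


def threshold_sweep(df_values: np.ndarray, labels: np.ndarray, num_thresholds: int = 50):
    """Compute MCC, precision, and recall for evenly spaced decision-function thresholds."""
    step = (df_values.max() - df_values.min()) / num_thresholds
    thresholds = np.arange(df_values.min(), df_values.max() + step, step)

    mcc, precision, recall = [], [], []
    for threshold in thresholds:
        predict = np.where(df_values >= threshold, 1, 0)
        mcc.append(matthews_corrcoef(labels, predict))
        precision.append(precision_score(labels, predict, zero_division=1))
        recall.append(recall_score(labels, predict, zero_division=1))

    return thresholds, np.array(mcc), np.array(precision), np.array(recall)
```
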
save_eval_result(eval_result) -def evaluate_mcc_scaling(): - if multi_scaling: - evaluate_multi_mcc_scaling_with_optimum_shift() - else: - evaluate_mcc_scaling_with_optimum_shift() - - def parse_args_and_options(parser): args = parser.parse_args() @@ -377,10 +325,6 @@ def parse_args_and_options(parser): global mcc_values mcc_values = args.mcc_values - if args.multi_scaling: - global multi_scaling - multi_scaling = args.multi_scaling - if args.zero_null_vecs: global zero_null_vectors zero_null_vectors = args.zero_null_vecs @@ -415,10 +359,10 @@ def main(): help="Path of the associated MCC values (if available)", ) parser.add_argument( - "--multi-scaling", - action="store_true", - default=False, - help="Specifiy if same scaling should be applied to multiple results", + "--num-eval-thresholds", + action="store", + default=50, + help="Specifiy the number of evaluation thresholds", ) parser.add_argument( "--zero-null-vecs", @@ -434,7 +378,7 @@ def main(): parse_args_and_options(parser) - evaluate_mcc_scaling() + evaluate_mcc_scaling_with_optimum_shift() if __name__ == "__main__": diff --git a/amides/bin/extract_features.py b/amides/bin/extract_features.py index 4985a12..8a02ae1 100755 --- a/amides/bin/extract_features.py +++ b/amides/bin/extract_features.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +"""This script extracts the vocabulary learned by feature extractors and the corresponding +weights from the fitted SVC and sorts them according to the feature weight. The top n +features with the highest/lowest weight can be plotted.""" import os import sys diff --git a/amides/bin/extract_terms.py b/amides/bin/extract_terms.py index a2b55de..0298cf7 100644 --- a/amides/bin/extract_terms.py +++ b/amides/bin/extract_terms.py @@ -1,57 +1,34 @@ -from luqum.parser import parser -from amides.features.normalize import Normalizer -from luqum.visitor import TreeVisitor -from luqum.tree import NoneItem -from argparse import ArgumentParser +#!/usr/bin/env python3 +"""This script enables to extract values of different search fields from Sigma rules. Extracted search field values can be additionally +normalized, i.e. split into lists of token strings using preprocessing, tokenization, and token elimination. 
+""" + + import glob import os import yaml import itertools +from luqum.parser import parser +from argparse import ArgumentParser - -class MultiFieldVisitor(TreeVisitor): - def __init__(self, fields): - super(MultiFieldVisitor, self).__init__(track_parents=False) - self._fields = fields - self._values = [] - - @property - def values(self): - return self._values - - def visit_search_field(self, node, context): - match = False - for field in self._fields: - if node.name == field or node.name.startswith(field + "|"): - match = True - if match: - context = self.child_context(node, NoneItem(), context) - context[node.name.split("|")[0]] = True - yield from self.generic_visit(node, context) - - def visit_phrase(self, node, context): - for field in self._fields: - if context.get(field, False): - if node.value.startswith('"') and node.value.endswith('"'): - self._values.append(node.value[1:-1]) - else: - self._values.append(node.value) - yield from self.generic_visit(node, context) - - def visit_not(self, node, context): - yield NoneItem() +from amides.features.normalize import Normalizer +from amides.sigma import MultiFieldVisitor def main(): args = parse_args() fields_of_interest = args.fields.split(",") rule_filters = [doc["filter"] for doc in read_rules(args.dir)] - terms_per_filter = [terms_of_interest(filter, fields_of_interest) for filter in rule_filters] + terms_per_filter = [ + terms_of_interest(filter, fields_of_interest) for filter in rule_filters + ] print_terms(terms_per_filter, args.normalize) def parse_args(): - parser = ArgumentParser(description="Extract search terms from converted Sigma rules.") + parser = ArgumentParser( + description="Extract search terms from converted Sigma rules." + ) parser.add_argument("--normalize", action="store_true", help="normalize output") parser.add_argument("dir", help="rule directory") parser.add_argument("fields", help="comma-separated list of field names") @@ -62,7 +39,7 @@ def read_rules(dir): docs = [] rule_files = glob.glob(os.path.join(dir, "*.yml")) for rule_file in rule_files: - with open(rule_file) as f: + with open(rule_file, "r", encoding="utf-8") as f: docs.extend(yaml.safe_load_all(f)) return docs diff --git a/amides/bin/extract_terms_multi.py b/amides/bin/extract_terms_multi.py index c37ee57..997c9bd 100755 --- a/amides/bin/extract_terms_multi.py +++ b/amides/bin/extract_terms_multi.py @@ -1,49 +1,18 @@ #!/usr/bin/env python3 - -from luqum.parser import parser -from amides.features.normalize import Normalizer -from luqum.visitor import TreeVisitor -from luqum.tree import NoneItem -from argparse import ArgumentParser -from pathlib import Path +"""This script enables to extract values of different search fields from Sigma rules. Extracted search field values can be additionally +normalized, i.e. split into lists of token strings using preprocessing, tokenization, and token elimination. 
+""" import glob import os import yaml import json - -class MultiFieldVisitor(TreeVisitor): - def __init__(self, fields): - super(MultiFieldVisitor, self).__init__(track_parents=False) - self._fields = fields - self._values = [] - - @property - def values(self): - return self._values - - def visit_search_field(self, node, context): - match = False - for field in self._fields: - if node.name == field or node.name.startswith(field + "|"): - match = True - if match: - context = self.child_context(node, NoneItem(), context) - context[node.name.split("|")[0]] = True - yield from self.generic_visit(node, context) - - def visit_phrase(self, node, context): - for field in self._fields: - if context.get(field, False): - if node.value.startswith('"') and node.value.endswith('"'): - self._values.append(node.value[1:-1]) - else: - self._values.append(node.value) - yield from self.generic_visit(node, context) - - def visit_not(self, node, context): - yield NoneItem() +from luqum.parser import parser +from amides.features.normalize import Normalizer +from amides.sigma import MultiFieldVisitor +from argparse import ArgumentParser +from pathlib import Path def main(): @@ -54,7 +23,7 @@ def main(): normalizer = Normalizer(max_len_num_values=3) rules = [] for rule_file in rule_files: - with open(rule_file) as f: + with open(rule_file, "r", encoding="utf-8") as f: docs = list(yaml.safe_load_all(f)) rule = {} # rule["title"] = docs[0]["pre_detector"]["title"] diff --git a/amides/bin/normalize.py b/amides/bin/normalize.py index 5a7caca..a24bc88 100755 --- a/amides/bin/normalize.py +++ b/amides/bin/normalize.py @@ -1,4 +1,9 @@ #! /usr/bin/env python3 +"""This script normalizes benign samples, matches, rule filter, and evasions into list +of tokens. The normalization performed in this script is the same normalization +that is performed by AMIDES during operational use. Normalized/converted samples +are written into a separate output file. +""" import argparse import sys @@ -73,7 +78,7 @@ def extract_tokens(cmdline): def samples_file(samples_file_path: str): try: - with open(samples_file_path, "r") as in_file: + with open(samples_file_path, "r", encoding="utf-8") as in_file: for line in in_file: stripped = line.rstrip("\n") @@ -160,14 +165,14 @@ def main(): parser.add_argument( "--sigma-dir", action="store", - default=os.path.join(base_dir, "Daten/Sigma-Studie"), + default=os.path.join(base_dir, "data/sigma"), ) parser.add_argument( "-o", "--out-file", type=str, action="store", - default=os.path.join(os.getcwd(), "tokens.out"), + default=os.path.join(os.getcwd(), "tokens.txt"), ) args = parser.parse_args() diff --git a/amides/bin/plot_attr.py b/amides/bin/plot_attr.py index f99a7d3..aadd8bb 100644 --- a/amides/bin/plot_attr.py +++ b/amides/bin/plot_attr.py @@ -1,4 +1,10 @@ #!/usr/bin/env python3 +"""This script illustrates the evaluation results of the rule attribution model evaluation. 
Depending
+on the specified plot option, this means:
+ (1) Distribution of the position of the correct (evaded) rule in the ranked list of potentially evaded rules
+ (2) Cumulative distribution of the position of the correct (evaded) rule in the ranked list of potentially evaded rules
+ (3) Both
+"""
import sys
import os
@@ -12,7 +18,6 @@
CombinedDistributionPlot,
)
from amides.utils import (
- get_current_timestamp,
get_logger,
set_log_level,
load_args_from_file,
@@ -128,23 +133,21 @@ def main():
"--eval-result",
type=str,
action="store",
- help="Path to a pickled evaluation result",
+ help="Path to the RuleAttributionEvaluationResult",
)
parser.add_argument(
"--plot",
type=str,
action="store",
choices=["dist", "cum_dist", "combined"],
- help="Type of result plot which should be created",
+ help="Type of plot that should be created",
)
+ parser.add_argument("--out-dir", type=str, action="store", help="Output directory")
parser.add_argument(
- "--out-dir", type=str, action="store", help="Output directory to save plots"
+ "--title", type=str, action="store", help="Title of the final figure"
)
parser.add_argument(
- "--title", type=str, action="store", help="Title of the multi PR diagram"
- )
- parser.add_argument(
- "--config", type=str, action="store", help="Path to config file."
+ "--config", type=str, action="store", help="Path of the config file."
)
args = parse_args_and_options(parser)
diff --git a/amides/bin/plot_df_hist.py b/amides/bin/plot_df_hist.py
index 4b17ce2..29250ab 100755
--- a/amides/bin/plot_df_hist.py
+++ b/amides/bin/plot_df_hist.py
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
+"""This script creates a histogram of the decision function values computed by a misuse
+classification model for benign samples.
+"""
import seaborn
import os
diff --git a/amides/bin/plot_df_values.py b/amides/bin/plot_df_values.py
index 087dbc9..aa5db18 100644
--- a/amides/bin/plot_df_values.py
+++ b/amides/bin/plot_df_values.py
@@ -1,3 +1,8 @@
+#!/usr/bin/env python3
+"""This script is used to create swarm-, box-, and violin-plots of decision function values of benign samples,
+rules, and evasions.
+"""
+
import argparse
import sys
import os
@@ -166,7 +171,7 @@ def main():
"valid_result",
type=str,
action="store",
- help="Path of result file containing trained (or calibrated) estimator",
+ help="Path of pickled ValidationResult",
)
parser.add_argument(
"file_name", type=str, action="store", help="Name of the output file"
)
diff --git a/amides/bin/plot_multi_tainted.py b/amides/bin/plot_multi_tainted.py
index 16016ea..b35b825 100644
--- a/amides/bin/plot_multi_tainted.py
+++ b/amides/bin/plot_multi_tainted.py
@@ -1,4 +1,10 @@
#!/usr/bin/env python3
+"""This script is used to create a precision-recall plot of multiple evaluation results of models whose training data has been tainted.
+The idea of this plot is to visualize the influence of different fractions of tainted training data on the classification performance.
+For better comparison, a baseline result (e.g. with the same parameters, but without tainted data) is provided.
+In case of multiple evaluation results (for multiple fractions of tainted data), the script calculates the average precision and
+recall for each fraction of tainting and plots the average values into the figure.
+""" import sys import os @@ -137,28 +143,28 @@ def main(): "-b", type=str, action="store", - help="Path of the base evaluation result without tainted benign samples.", + help="Path of the baseline evaluation result (without tainted benign samples)", ) parser.add_argument( "--low-tainted", "-lt", type=str, action="append", - help="Path of evaluation result with low percentage of tainted samples (10%)", + help="Path of evaluation result(s) with low tainting (10%)", ) parser.add_argument( "--medium-tainted", "-mt", type=str, action="append", - help="Path of evaluation result with medium percentage of tainted samples (20%)", + help="Path of evaluation result(s) with medium tainting (20%)", ) parser.add_argument( "--high-tainted", "-ht", type=str, action="append", - help="Path of evaluation result with high percentage of tainted samples (30%)", + help="Path of evaluation result(s) with high tainting (30%)", ) parser.add_argument( "--out-dir", "-o", type=str, action="store", help="Path of the output directory" diff --git a/amides/bin/plot_pr.py b/amides/bin/plot_pr.py index 04b503a..86b454c 100755 --- a/amides/bin/plot_pr.py +++ b/amides/bin/plot_pr.py @@ -1,4 +1,8 @@ #!/usr/bin/env python3 +"""This script is used to create plots of evaluation results of misuse classification models. The script takes +precision, recall, f1-score, and mcc-values from the EvaluationResult provided and illustrates them in a +so called precision-recall-thresholds plot. The final plot is saved as .pdf-file into the specified output location. +""" import sys import os @@ -151,14 +155,14 @@ def main(): "--result", type=str, action="append", - help="Path to a pickled evaluation result", + help="Path of a pickled EvaluationResult whose data should be visualized", ) parser.add_argument( "--type", action="store", choices=["prt", "multi_prt"], default="prt", - help="Specifiy which type of precision-recall visualization should be made.", + help="Specifiy which type of precision-recall plot should be made", ) parser.add_argument("--save", action="store_true", help="Save plots to file(s)") parser.add_argument( @@ -167,13 +171,13 @@ def main(): parser.add_argument( "--interactive", action="store_true", - help="Show plot when computation is finished. Requires manual interaction to close window and finish the script.", + help="Show plot when computation is finished. Requires manual interaction to close window and finish the script", ) parser.add_argument( "--title", type=str, action="store", - help="Title of the precision-recall diagram.", + help="Title of the precision-recall diagram", ) parser.add_argument( "--config", type=str, action="store", help="Path to config file." 
diff --git a/amides/bin/results.sh b/amides/bin/results.sh
deleted file mode 100755
index e67ce0d..0000000
--- a/amides/bin/results.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-# Creating misuse classification results for 'process_creation' rules using SOCBED data
-python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules.json
-python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_matches.json
-python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules.json
-python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_matches.json
-python3 bin/eval_mcc_scaling.py --config bin/config/process_creation/eval_misuse_svc_rules.json
-python3 bin/eval_mcc_scaling.py --config bin/config/process_creation/eval_misuse_svc_matches.json
-python3 bin/plot_pr.py --config bin/config/process_creation/prt_plot_misuse_rules_matches.json
-
-# Creating rule attribution results for 'process_creation' rules using SOCBED data
-cat models/process_creation/train_rslt_misuse_svc_rules_f1_0_info.json | jq .estimator_params > bin/config/process_creation/params.json
-python3 bin/train.py --config bin/config/process_creation/train_attr_svc_rules.json
-python3 bin/eval_attr.py --config bin/config/process_creation/eval_attr.json
-python3 bin/plot_attr.py --config bin/config/process_creation/attr_plot.json
-
-# Creating tainted classification results for 'process_creation' events using SOCBED data
-python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules_tainted_10.json
-python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules_tainted_20.json
-python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules_tainted_30.json
-python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules_tainted_10.json
-python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules_tainted_20.json
-python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules_tainted_30.json
-python3 bin/eval_mcc_scaling.py --config bin/config/process_creation/eval_misuse_svc_rules_tainted.json
-python3 bin/plot_multi_tainted.py --config bin/config/process_creation/pr_plot_tainted.json
-
-
-# Creating misuse classification results for 'powershell', 'proxy_web', and 'registry' rules
-python3 bin/train_new_types.py --config bin/config/powershell/train_misuse_svc_rules.json
-python3 bin/train_new_types.py --config bin/config/proxy_web/train_misuse_svc_rules.json
-python3 bin/train_new_types.py --config bin/config/registry/train_misuse_svc_rules.json
-
-python3 bin/validate_new_types.py --config bin/config/powershell/validate_misuse_svc_rules.json
-python3 bin/validate_new_types.py --config bin/config/proxy_web/validate_misuse_svc_rules.json
-python3 bin/validate_new_types.py --config bin/config/registry/validate_misuse_svc_rules.json
-
-python3 bin/eval_mcc_scaling.py --config bin/config/powershell/eval_misuse_svc_rules.json
-python3 bin/eval_mcc_scaling.py --config bin/config/proxy_web/eval_misuse_svc_rules.json
-python3 bin/eval_mcc_scaling.py --config bin/config/registry/eval_misuse_svc_rules.json
-python3 bin/plot_pr.py --config bin/config/pr_plot_powershell_proxy_registry.json
-
-
-
-
diff --git a/amides/bin/split_terms.py b/amides/bin/split_terms.py
index a3f558e..1cde0c2 100644
--- a/amides/bin/split_terms.py
+++ b/amides/bin/split_terms.py
@@ -1,18 +1,25 @@
+#!/usr/bin/env python3
+"""Assuming samples are stored in .txt or
.jsonl-files, this script can be used to randomly split the samples in half +and save them in two separate files (assuming the given file contains one sample per line). +""" + import argparse import random def main(): args = parse_args() - lines = open(args.file).readlines() + lines = open(args.file, encoding="utf-8").readlines() random.shuffle(lines) half = int(len(lines) / 2) - open(args.file + "_train", "w").writelines(lines[:half]) - open(args.file + "_test", "w").writelines(lines[half:]) + open(args.file + "_train", "w", encoding="utf-8").writelines(lines[:half]) + open(args.file + "_test", "w", encoding="utf-8").writelines(lines[half:]) def parse_args(): - parser = argparse.ArgumentParser(description="Randomly split text file into training and test files") + parser = argparse.ArgumentParser( + description="Randomly split text file into training and test files" + ) parser.add_argument("file", help="filename") return parser.parse_args() diff --git a/amides/bin/train.py b/amides/bin/train.py index 05a3996..add94cb 100755 --- a/amides/bin/train.py +++ b/amides/bin/train.py @@ -1,4 +1,15 @@ #!/usr/bin/env python3 +""" +This script is used to train models for the AMIDES misuse classification and rule attribution components. Models are +trained using feature vectors extracted from benign samples of enterprise networks and Sigma rule filters or matches +(i.e. events triggering SIEM detection rules) serving as malicious samples. + +Benign samples are provided in .txt-files, one sample per line. Sigma rule data (rule filters, matches, evasions) are provided +in folders with .json files, one element per file. + +The trained model, converted training data, the feature extractor, as well as the scaler are pickled and saved into a single .zip-archive. +The archive is accompanied by a JSON-file holding meta information on the produced results in human-readable format. 
+""" import os import sys @@ -91,6 +102,7 @@ num_mcc_samples = 50 tainted_benign_samples = 0.0 +tainted_random_seed = 42 tainted_sample_seedings = [] num_subprocesses = 1 @@ -163,7 +175,7 @@ def prepare_benign_sample_tainting(): sys.exit(1) global tainted_sample_seedings - random.seed(42) + random.seed(tainted_random_seed) for _ in range(num_iterations): tainted_sample_seedings.append(random.randint(0, 100)) @@ -511,6 +523,10 @@ def parse_args_and_options(parser: argparse.ArgumentParser): global tainted_benign_samples tainted_benign_samples = args.tainted_benign_samples / 100.0 + if args.tainted_seed: + global tainted_random_seed + tainted_random_seed = args.tainted_seed + if args.vectorization: global vectorization vectorization = args.vectorization @@ -580,39 +596,41 @@ def parse_args_and_options(parser: argparse.ArgumentParser): def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description="Train misuse and rule attribution models for AMIDES" + ) parser.add_argument( "--benign-samples", type=str, nargs="?", action="store", - help="Path to benign samples used for training.", + help="Path of the benign training samples file (.txt)", ) parser.add_argument( "--deduplicate", action="store_true", - help="Perform training data deduplication 'on-the-fly' before model training", + help="Perform deduplication of benign samples before model training", ) parser.add_argument( "--normalize", type=str, nargs="?", action="store", - help="Normalize benign samples before training.", + help="Normalize benign samples before training", ) parser.add_argument( "--events-dir", type=str, nargs="?", action="store", - help="Path of the directory where the rule set evasions (and matches) are located.", + help="Path of the directory with Sigma rule matches and evasions (.json)", ) parser.add_argument( "--rules-dir", type=str, nargs="?", action="store", - help="Path of the directory where the rule set rule data is located.", + help="Path of the directory with Sigma detection rules (.yml)", ) parser.add_argument( "--model-type", @@ -626,20 +644,26 @@ def main(): type=str, action="store", choices=["rule_filters, matches"], - help="Type of malicious samples to be used for training", + help="Specifies the type of malicious samples used for training", ) parser.add_argument( "--tainted-benign-samples", type=float, action="store", - help="Taint benign training samples", + help="Fraction (0-100) of evasions that are used for benign samples tainting", + ) + parser.add_argument( + "--tainted-seed", + type=int, + action="store", + help="Seeding value to init benign sample tainting", ) parser.add_argument( "--vectorization", type=str, action="store", choices=["count", "binary_count", "tfidf", "hashing", "scaled_count"], - help="Specifies the vectorizer class that should be used", + help="Specifies the type of vectorization used to create feature vectors", ) parser.add_argument( "--tokenization", @@ -653,7 +677,7 @@ def main(): "ws_ast_sla_min_eq", "comma_separation", ], - help="Specifiecs the sample tokenizer (if ngram_mode == 'word')", + help="Specifiecs the sample tokenizer given to the vectorizer (if ngram_mode == 'word')", ) parser.add_argument( "--ngram-mode", @@ -666,54 +690,54 @@ def main(): "--ngram-range", type=str, action="store", - help="Specifies the n-gram range used by the vectorizer", + help="Specifies the n-gram range used by the vectorizer (Example: (1,1))", ) parser.add_argument( "--search-params", action="store_true", - help="Search the given parameter grid for optimal 
hyper-parameters.", - ) - parser.add_argument( - "--mcc-scaling", - action="store_true", - help="Scale decision function values using MCC-Scaling", - ) - parser.add_argument( - "--mcc-threshold", - action="store", - type=float, - help="Threshold value for MCC-Scaling", + help="Optimize the classifier by searching a given hyper-parameter space", ) parser.add_argument( "--scoring", type=str, action="store", choices=["f1", "mcc"], - help="Choose the scoring function used for model evaluation", + help="Choose the scoring function used for candidate evaluation when performing exhaustive parameter optimization", + ) + parser.add_argument( + "--cv", + type=int, + action="store", + help="Number of cross-validation splits used for parameter optimization", ) parser.add_argument( "--model-params", type=str, action="store", - help="Path to JSON-file containing parameters used for model fitting", + help="Path to JSON-file containing parameters used for just fitting the model (no parameter optimization)", ) parser.add_argument( - "--cv", - type=int, + "--mcc-scaling", + action="store_true", + help="Scale decision function values using MCC-Scaling", + ) + parser.add_argument( + "--mcc-threshold", action="store", - help="Number of cross-validation splits used for parameter optimization", + type=float, + help="Threshold value used for MCC-Scaling", ) parser.add_argument( "--num-jobs", type=int, action="store", - help="Number of parallel jobs when grid search is performed", + help="Number of parallel jobs when search candidates are evaluated during parameter optimization", ) parser.add_argument( "--num-subprocesses", type=int, action="store", - help="Number of processes used when creating the rule attribution model", + help="Number of parallel processes used to create the rule models for the rule attribution model", ) parser.add_argument( "--num-iterations", @@ -724,7 +748,7 @@ def main(): parser.add_argument( "--save-data", action="store_true", - help="Specify if training data should be added to TrainingResult", + help="Specify if the transformed training data (feature vectors) should be added to TrainingResult", ) parser.add_argument( "--out-dir", @@ -736,7 +760,7 @@ def main(): "--result-name", type=str, action="store", - help="Specifies the result's base name", + help="Specifies the base name of the result files", ) parser.add_argument( "--config", type=str, action="store", help="Path to config file." diff --git a/amides/bin/train_new_types.py b/amides/bin/train_new_types.py index 5763a5d..3b2f8b5 100755 --- a/amides/bin/train_new_types.py +++ b/amides/bin/train_new_types.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +"""This script trains misuse classification models for other event types.""" import sys import argparse diff --git a/amides/bin/train_new_types_multi.py b/amides/bin/train_new_types_multi.py index cca8639..d391ddf 100755 --- a/amides/bin/train_new_types_multi.py +++ b/amides/bin/train_new_types_multi.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +"""This script trains rule attribution models for the other rule event types.""" + import argparse import json diff --git a/amides/bin/validate.py b/amides/bin/validate.py index 0eb02dc..e657491 100755 --- a/amides/bin/validate.py +++ b/amides/bin/validate.py @@ -1,4 +1,16 @@ #!/usr/bin/env python3 +"""This script is used to validate models trained for the AMIDES misuse classification and rule attribution components +using a set of benign samples and Sigma rule evasions. Benign samples are provided in the same format as for train.py. 
+ +After loading estimator and feature extractor from the given TrainingResult, the feature extractor +transforms the given benign validation samples and Sigma evasions into feature vectors. Afterwards, the model is used +to calculate decision function values for the transformed validation samples. In case of a MultiTrainingResult, the +step is repeated for each rule model provided. + +The calculated decision function values and feature vectors are stored together with the rest of the TrainingResult into +a ValidationResult, which is then pickled. In case of a rule attribution model validation, ValidationResult objects for +each rule model are pickled into a single MultiValidationResult object. +""" import sys import os @@ -17,7 +29,7 @@ from amides.persist import Dumper, PersistError from amides.features.extraction import CommandlineExtractor from amides.features.normalize import normalize -from amides.evaluation import BinaryEvaluationResult, NegativeStateEvaluationResult +from amides.evaluation import BinaryEvaluationResult from amides.data import ( DataBunch, MultiTrainingResult, @@ -218,41 +230,6 @@ def prepare_validation_result( return valid_result -def evaluate_multi_model(multi_result: MultiValidationResult): - _logger.info("Evaluating model %s", multi_result.name) - check_benign_valid_samples() - pc_rules_data = load_pc_rules_dataset() - results = multi_result.results.values() - - result = next(results) - rule_dataset = pc_rules_data.get_rule_dataset_by_name(result.name) - validation_data = prepare_validation_data(result, rule_dataset) - - multi_eval_result = NegativeStateEvaluationResult( - thresholds=np.arange(-1, 1, 0.004), - origin_labels=validation_data.labels, - name=multi_result.name, - timestamp=multi_result.timestamp, - ) - - for result in results: - rule_dataset = pc_rules_data.get_rule_dataset_by_name(result.name) - validation_data = prepare_validation_data(result, rule_dataset) - - if result.predict is not None: - predict = result.predict - else: - predict = calculate_predict(result.estimator, validation_data.samples) - - try: - scaled_df_values = scale_df_values(predict, result.scaler) - multi_eval_result.evaluate(scaled_df_values, validation_data.labels) - except (ValueError, IndexError): - continue - - save_result(multi_eval_result) - - def validate_multi_model(multi_result: MultiTrainingResult): _logger.info("Validating model %s", multi_result.name) check_benign_valid_samples() @@ -271,8 +248,6 @@ def validate_multi_model(multi_result: MultiTrainingResult): save_result(multi_valid_result) - return multi_valid_result - def evaluate_single_model(valid_result: ValidationResult): _logger.info("Evaluating model %s", valid_result.name) @@ -333,10 +308,6 @@ def validate_model(result_path: str): evaluate_single_model(result) elif type(result) is MultiTrainingResult: valid_result = validate_multi_model(result) - if evaluate: - evaluate_multi_model(valid_result) - elif type(result) is MultiValidationResult: - evaluate_multi_model(result) else: _logger.error("Loaded object is not of supported result type. 
Exiting.") sys.exit(1) @@ -422,32 +393,34 @@ def parse_args_and_options(parser: argparse.ArgumentParser): def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description="Validate misuse and rule attribution models for AMIDES" + ) parser.add_argument( "--result-path", type=str, action="append", - help="Path of a pickled TrainingResult(s) or MultiTrainingResult(s)", + help="Path of a pickled TrainingResult or MultiTrainingResult", ) parser.add_argument( "--benign-samples", type=str, action="store", - help="Path to benign events used for evaluation", + help="Path of the benign validation samples file (.txt)", ) parser.add_argument( "--events-dir", type=str, nargs="?", action="store", - help="Path of the directory where the rule set events (matches and evasions) are located", + help="Path of the directory with Sigma rule matches and evasions (.json)", ) parser.add_argument( "--rules-dir", type=str, nargs="?", action="store", - help="Path of the directory where the rule set rule data is located.", + help="Path of the directory with Sigma detection rules (.yml)", ) parser.add_argument( "--malicious-samples-type", @@ -464,24 +437,24 @@ def main(): parser.add_argument( "--adapt-scaling", action="store_true", - help="Adapt given scaling to symmetric MCC-scaling using invsere scale transformation", + help="Adapt given scaler to symmetric MCC-scaler using invsere scale transformation on the validation data", ) parser.add_argument( "--evaluate", action="store_true", - help="Evaluate the loaded model using custom evaluation method", + help="Evaluate the provided model(s) using validation data and scaled model decision function values.", ) parser.add_argument( "--num-eval-thresholds", type=int, action="store", default=50, - help="Number of evaluation thresholds when using 'custom'-mode", + help="Number of evaluation thresholds used when model(s) are evaluated", ) parser.add_argument( "--zero-to-zero", action="store_true", - help="Set prediction of all-zero vectors to zero", + help="Set decision function values of all-zero feature vectors to 0.0", ) parser.add_argument( "--out-dir", diff --git a/amides/bin/validate_new_types.py b/amides/bin/validate_new_types.py index c8a52ac..b2d7892 100755 --- a/amides/bin/validate_new_types.py +++ b/amides/bin/validate_new_types.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +"""This script validates misuse classification models for other rule and event types. +""" + import sys import argparse diff --git a/amides/bin/validate_new_types_multi.py b/amides/bin/validate_new_types_multi.py index cd808ce..5be32d6 100644 --- a/amides/bin/validate_new_types_multi.py +++ b/amides/bin/validate_new_types_multi.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python3 + +"""This script validates rule attribution models for the other rule and event types. 
+""" + import argparse import re from pprint import pprint @@ -11,7 +16,9 @@ def main(): filename_regex = ".*multi_train_rslt_([a-z]+)_[0-9]+_[0-9]+.zip" category = re.compile(filename_regex).match(args.file).group(1) - evasion_values = open("data/generalisation/values_evasion_" + category).readlines() + evasion_values = open( + "data/generalisation/values_evasion_" + category, "r", encoding="utf-8" + ).readlines() normalizer = Normalizer(max_len_num_values=3) evasion_tokens = [normalizer.normalize(sample) for sample in evasion_values] diff --git a/amides/classification.sh b/amides/classification.sh new file mode 100755 index 0000000..1007765 --- /dev/null +++ b/amides/classification.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Creating misuse classification results for 'process_creation' rules using SOCBED data +python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules.json +python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_matches.json +python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules.json +python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_matches.json +python3 bin/eval_mcc_scaling.py --config bin/config/process_creation/eval_misuse_svc_rules.json +python3 bin/eval_mcc_scaling.py --config bin/config/process_creation/eval_misuse_svc_matches.json +python3 bin/plot_pr.py --config bin/config/process_creation/prt_plot_misuse_rules_matches.json diff --git a/amides/classification_other_types.sh b/amides/classification_other_types.sh new file mode 100755 index 0000000..eca7287 --- /dev/null +++ b/amides/classification_other_types.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Creating misuse classification results for 'powershell', 'proxy_web', and 'registry' rules +python3 bin/train_new_types.py --config bin/config/powershell/train_misuse_svc_rules.json +python3 bin/train_new_types.py --config bin/config/proxy_web/train_misuse_svc_rules.json +python3 bin/train_new_types.py --config bin/config/registry/train_misuse_svc_rules.json + +python3 bin/validate_new_types.py --config bin/config/powershell/validate_misuse_svc_rules.json +python3 bin/validate_new_types.py --config bin/config/proxy_web/validate_misuse_svc_rules.json +python3 bin/validate_new_types.py --config bin/config/registry/validate_misuse_svc_rules.json + +python3 bin/eval_mcc_scaling.py --config bin/config/powershell/eval_misuse_svc_rules.json +python3 bin/eval_mcc_scaling.py --config bin/config/proxy_web/eval_misuse_svc_rules.json +python3 bin/eval_mcc_scaling.py --config bin/config/registry/eval_misuse_svc_rules.json +python3 bin/plot_pr.py --config bin/config/pr_plot_powershell_proxy_registry.json diff --git a/amides/experiments.sh b/amides/experiments.sh new file mode 100755 index 0000000..e68ff75 --- /dev/null +++ b/amides/experiments.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +echo "########## Creating results for misuse classification (C1, C2) ##########" +./classification.sh + +echo "########## Creating results for rule attribution (C3) ##########" +./rule_attribution.sh + +echo "########## Creating results for tainted training data (C4) ##########" +./tainted_training.sh + +echo "########## Creating results for other rule types (C5) ##########" +./classification_other_types.sh diff --git a/amides/requirements.in b/amides/requirements.in index 55fdbfa..4faada0 100644 --- a/amides/requirements.in +++ b/amides/requirements.in @@ -1,10 +1,10 @@ scikit-learn>=1.2.0 numpy -joblib +joblib>=1.2.0 matplotlib luqum scipy>=1.9.2 -pandas 
+pandas>=1.5.3 seaborn elasticsearch ndjson diff --git a/amides/requirements.txt b/amides/requirements.txt index 4a3482d..841c8b5 100644 --- a/amides/requirements.txt +++ b/amides/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --resolver=backtracking requirements.in +# pip-compile requirements.in # certifi==2022.12.7 # via elastic-transport diff --git a/amides/requirements_dev.in b/amides/requirements_dev.in new file mode 100644 index 0000000..0bf65a4 --- /dev/null +++ b/amides/requirements_dev.in @@ -0,0 +1,3 @@ +-r requirements.in +pytest +pytest-cov diff --git a/amides/requirements_dev.txt b/amides/requirements_dev.txt new file mode 100644 index 0000000..04bb73f --- /dev/null +++ b/amides/requirements_dev.txt @@ -0,0 +1,99 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements_dev.in +# +certifi==2023.7.22 + # via elastic-transport +contourpy==1.1.1 + # via matplotlib +coverage[toml]==7.3.2 + # via + # coverage + # pytest-cov +cycler==0.12.1 + # via matplotlib +elastic-transport==8.4.1 + # via elasticsearch +elasticsearch==8.10.1 + # via -r requirements.in +exceptiongroup==1.1.3 + # via pytest +fonttools==4.43.1 + # via matplotlib +iniconfig==2.0.0 + # via pytest +joblib==1.3.2 + # via + # -r requirements.in + # scikit-learn +kiwisolver==1.4.5 + # via matplotlib +luqum==0.13.0 + # via -r requirements.in +matplotlib==3.8.0 + # via + # -r requirements.in + # seaborn +ndjson==0.3.1 + # via -r requirements.in +numpy==1.26.1 + # via + # -r requirements.in + # contourpy + # matplotlib + # pandas + # scikit-learn + # scipy + # seaborn +packaging==23.2 + # via + # matplotlib + # pytest +pandas==2.1.1 + # via + # -r requirements.in + # seaborn +pillow==10.1.0 + # via matplotlib +pluggy==1.3.0 + # via pytest +ply==3.11 + # via luqum +pyparsing==3.1.1 + # via matplotlib +pytest==7.4.2 + # via + # -r requirements_dev.in + # pytest-cov +pytest-cov==4.1.0 + # via -r requirements_dev.in +python-dateutil==2.8.2 + # via + # matplotlib + # pandas +pytz==2023.3.post1 + # via pandas +pyyaml==6.0.1 + # via -r requirements.in +scikit-learn==1.3.1 + # via -r requirements.in +scipy==1.11.3 + # via + # -r requirements.in + # scikit-learn +seaborn==0.13.0 + # via -r requirements.in +six==1.16.0 + # via python-dateutil +threadpoolctl==3.2.0 + # via scikit-learn +tomli==2.0.1 + # via + # coverage + # pytest +tzdata==2023.3 + # via pandas +urllib3==1.26.18 + # via elastic-transport diff --git a/amides/rule_attribution.sh b/amides/rule_attribution.sh new file mode 100755 index 0000000..c8b3263 --- /dev/null +++ b/amides/rule_attribution.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Creating rule attribution results for 'process_creation' rules using SOCBED data +cat models/process_creation/train_rslt_misuse_svc_rules_f1_0_info.json | jq .estimator_params > bin/config/process_creation/params.json +python3 bin/train.py --config bin/config/process_creation/train_attr_svc_rules.json +python3 bin/eval_attr.py --config bin/config/process_creation/eval_attr.json +python3 bin/plot_attr.py --config bin/config/process_creation/attr_plot.json diff --git a/amides/setup.py b/amides/setup.py index fd7f8ff..1c36613 100644 --- a/amides/setup.py +++ b/amides/setup.py @@ -1,8 +1,16 @@ from setuptools import setup, find_packages + +with open("requirements.in", encoding="utf-8", mode="r") as f: + requirements = f.read().splitlines() + setup( name="amides", version="0.1", + 
description="Amides package contains proof-of-concept implementation of the Adaptive Misuse Detection System (AMIDES).", + url="https://github.com/fkie-cad/amides", + license="GPL-3.0 license", packages=find_packages(), - description="Amides package contains proof-of-concept implementation of the Adaptive Misuse Detection System (AMiDeS)." + install_requires=["setuptools"] + requirements, + python_requires=">=3.10", ) diff --git a/amides/tainted_training.sh b/amides/tainted_training.sh new file mode 100755 index 0000000..00cafd0 --- /dev/null +++ b/amides/tainted_training.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Creating tainted classification results for 'process_creation' events using SOCBED data +python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules_tainted_10.json +python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules_tainted_20.json +python3 bin/train.py --config bin/config/process_creation/train_misuse_svc_rules_tainted_30.json +python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules_tainted_10.json +python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules_tainted_20.json +python3 bin/validate.py --config bin/config/process_creation/validate_misuse_svc_rules_tainted_30.json +python3 bin/eval_mcc_scaling.py --config bin/config/process_creation/eval_misuse_svc_rules_tainted.json +python3 bin/plot_multi_tainted.py --config bin/config/process_creation/pr_plot_tainted.json diff --git a/amides/tests/data/collect_matches_evasions.py b/amides/tests/data/collect_matches_evasions.py deleted file mode 100755 index 6d6132c..0000000 --- a/amides/tests/data/collect_matches_evasions.py +++ /dev/null @@ -1,89 +0,0 @@ -#! /usr/bin/env python3 - -import sys -import os -import logging -import json -import argparse - -from amides.sigma import RuleSetDataset -from amides.events import benign_events_cache - -base_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), "../../../")) - -benign_events_dir = os.path.join(base_dir, "Daten/2021-02-05-socbed/split") -sigma_dir = os.path.join(base_dir, "Daten/Sigma-Studie") -pc_events_dir = os.path.join(sigma_dir, "events/windows/process_creation") -pc_rules_dir = os.path.join(sigma_dir, "rules/windows/process_creation") -out_dir = None - -logger = logging.getLogger() - - -def load_events_and_pc_rules_data(): - try: - pc_rules_dataset = RuleSetDataset(pc_events_dir, pc_rules_dir) - return pc_rules_dataset - except FileNotFoundError as err: - logger.err(err) - sys.exit(1) - - -def save_event(event, counter): - file_name = f"{counter:06d}.json" - - with open(os.path.join(out_dir, file_name), "w") as out_file: - json.dump(event, out_file, indent=4) - - -def save_matches_and_evasions(pc_rule_set_data): - try: - os.makedirs(out_dir, exist_ok=True) - - rule_datasets = pc_rule_set_data.rule_datasets.values() - i = 1 - for rule_dataset in rule_datasets: - for match in rule_dataset.matching_events.data: - save_event(match, i) - i += 1 - - for evasion in rule_dataset.evasive_events.data: - save_event(evasion, i) - i += 1 - - except OSError as err: - logger.error(err) - sys.exit(1) - - -def collect_matches_and_evasions(): - pc_rule_set_data = load_events_and_pc_rules_data() - save_matches_and_evasions(pc_rule_set_data) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("out_dir", type=str, action="store", - help="Path to the output directory") - parser.add_argument("sigma_dir", type=str, nargs="?", action="store", - help="Path to 
the directory where sigma_data is located") - - args = parser.parse_args() - - if args.sigma_dir: - global sigma_dir - sigma_dir = args.sigma_dir - - if args.out_dir: - global out_dir - out_dir = args.out_dir - else: - logger.error("No output directory specified. Exiting") - sys.exit(1) - - collect_matches_and_evasions() - - - -if __name__ == "__main__": - main() diff --git a/amides/tests/data/json_to_jsonl.py b/amides/tests/data/json_to_jsonl.py old mode 100755 new mode 100644 diff --git a/amides/tests/data/scale_events.py b/amides/tests/data/scale_events.py old mode 100755 new mode 100644 diff --git a/amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_01.json b/amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_01.json similarity index 100% rename from amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_01.json rename to amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_01.json diff --git a/amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_02.json b/amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_02.json similarity index 100% rename from amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_02.json rename to amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_02.json diff --git a/amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_03.json b/amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_03.json similarity index 100% rename from amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_03.json rename to amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_03.json diff --git a/amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_04.json b/amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_04.json similarity index 100% rename from amides/tests/data/sigma-study/events/proxy/rule_1/Proxy_Evasion_url_full_04.json rename to amides/tests/data/sigma-study/events/proxyweb/rule_1/Proxy_Evasion_url_full_04.json diff --git a/amides/tests/data/sigma-study/events/proxy/rule_1/properties.yml b/amides/tests/data/sigma-study/events/proxyweb/rule_1/properties.yml similarity index 100% rename from amides/tests/data/sigma-study/events/proxy/rule_1/properties.yml rename to amides/tests/data/sigma-study/events/proxyweb/rule_1/properties.yml diff --git a/amides/tests/data/sigma-study/rules/extract_proc_cmdline.py b/amides/tests/data/sigma-study/rules/extract_proc_cmdline.py deleted file mode 100755 index d9f0a7c..0000000 --- a/amides/tests/data/sigma-study/rules/extract_proc_cmdline.py +++ /dev/null @@ -1,67 +0,0 @@ -#! 
/usr/bin/env python3 - -import os -import re -import argparse -import json - -from ruamel.yaml import YAML -from ruamel.yaml.composer import ComposerError - -def read_yaml_file(file_path): - try: - yaml = YAML(typ="safe") - with open(file_path, "r") as f: - rule = yaml.load(f) - return rule - except ComposerError as exc: - print(f"Error: {exc}") - return None - - -def get_rule_filter(rule_dict): - proc_cmdline_regex = r"process\.command_line:\s" - - rule_filter = rule_dict.get("filter", None) - if rule_filter and re.search(proc_cmdline_regex, rule_filter): - return rule_filter - return None - - -def write_to_file(rule_filters, out_file): - with open(out_file, "w") as f: - json.dump(rule_filters, f, indent=1, sort_keys=True) - - -def show_results(rule_filters): - print(json.dumps(rule_filters, indent=1, sort_keys=True)) - - -def extract_process_cmdline(rules_path, out_file=None): - rule_filters = {} - - for rule_name in os.listdir(rules_path): - rule_path = os.path.join(rules_path, rule_name) - if os.path.isfile(rule_path) and (rule_name.endswith(".yaml") or rule_name.endswith(".yml")): - rule_dict = read_yaml_file(rule_path) - if rule_dict: - filter = get_rule_filter(rule_dict) - if filter: - rule_filters[rule_name] = filter - - if out_file: - write_to_file(rule_filters, out_file) - else: - show_results(rule_filters) - - -if __name__ == "__main__": - argparser = argparse.ArgumentParser() - argparser.add_argument("-r", "--rules-path", action="store", - help="Path to process_creation rules directory") - argparser.add_argument("-o", "--out-file", action="store", - help="Output file for results") - args = argparser.parse_args() - - if args.rules_path: - extract_process_cmdline(args.rules_path, args.out_file) diff --git a/amides/tests/data/sigma-study/rules/proxy/rule_1.yml b/amides/tests/data/sigma-study/rules/proxyweb/rule_1.yml similarity index 100% rename from amides/tests/data/sigma-study/rules/proxy/rule_1.yml rename to amides/tests/data/sigma-study/rules/proxyweb/rule_1.yml diff --git a/amides/tests/data/socbed-sample/powershell/jsonl/Microsoft-Windows-PowerShell_4104.jsonl b/amides/tests/data/socbed-sample/powershell/jsonl/Microsoft-Windows-PowerShell_4104.jsonl index 88d8b07..2ec1024 100644 --- a/amides/tests/data/socbed-sample/powershell/jsonl/Microsoft-Windows-PowerShell_4104.jsonl +++ b/amides/tests/data/socbed-sample/powershell/jsonl/Microsoft-Windows-PowerShell_4104.jsonl @@ -26,5 +26,5 @@ {"Keyless": "$global:?"} {"Keyless": "$reg_path = 'HKLM:\\Software\\Microsoft\\Windows NT\\CurrentVersion\\Winlogon'; Set-ItemProperty -Path $reg_path -Name 'AutoAdminLogon' -Value '1'; Set-ItemProperty -Path $reg_path -Name 'DefaultDomainName' -Value 'BREACH'; Set-ItemProperty -Path $reg_path -Name 'DefaultPassword' -Value 'breach'; Set-ItemProperty -Path $reg_path -Name 'DefaultUserName' -Value 'client2'"} {"Keyless": "$global:?"} -{"Keyless": "$admin_secpwd = ConvertTo-SecureString 'breach' -AsPlainText -Force; $credential = New-Object -TypeName System.Management.Automation.PSCredential -ArgumentList 'BREACH\Administrator', $admin_secpwd; Add-Computer -Credential $credential -DomainName 'BREACH' -Restart -Force"} +{"Keyless": "$admin_secpwd = ConvertTo-SecureString 'breach' -AsPlainText -Force; $credential = New-Object -TypeName System.Management.Automation.PSCredential -ArgumentList 'BREACH\\Administrator', $admin_secpwd; Add-Computer -Credential $credential -DomainName 'BREACH' -Restart -Force"} {"Keyless": "$global:?"} diff --git a/amides/tests/unit/test_data.py 
b/amides/tests/unit/test_data.py index 9bd60d3..2fcc3db 100644 --- a/amides/tests/unit/test_data.py +++ b/amides/tests/unit/test_data.py @@ -322,7 +322,6 @@ def test_create_info_dict(self): "positive_negative_ratio": 0.5, }, }, - "feature_extractors": [], "name": "tt_split_text", } @@ -488,7 +487,6 @@ def test_create_info_dict(self): "positive_negative_ratio": 0.5, }, }, - "feature_extractors": [], "name": "ttv_split_text", } diff --git a/amides/tests/unit/test_evaluation.py b/amides/tests/unit/test_evaluation.py deleted file mode 100644 index 6e53713..0000000 --- a/amides/tests/unit/test_evaluation.py +++ /dev/null @@ -1,57 +0,0 @@ -import pytest -import numpy as np - -from amides.evaluation import BinaryEvaluationResult - - -class TestBinaryClfEvaluationResult: - def test_default_name(self): - expected = "eval_rslt_dec_values_10" - binary_result = BinaryClfEvaluationResult( - np.linspace(0, 1, num=10), metric="dec_values" - ) - - assert binary_result.name == expected - - def test_file_name_default_name(self): - expected = "eval_rslt_dec_values_10_19700101_000000" - binary_result = BinaryClfEvaluationResult( - np.linspace(0, 1, num=10), metric="dec_values", timestamp="19700101_000000" - ) - - assert binary_result.file_name() == expected - - def test_file_name_custom_name(self): - expected = "eval_rslt_dec_values_10_19700101_000000" - binary_result = BinaryClfEvaluationResult( - np.linspace(0, 1, num=10), - metric="dec_values", - timestamp="19700101_000000", - name="svc_rules", - ) - - assert binary_result.file_name() == "eval_rslt_svc_rules_19700101_000000" - - def test_create_info_dict(self): - expected = { - "name": "svc_rules", - "timestamp": "19700101_000000", - "metric": "dec_values", - "thresholds": { - "num_thresholds": 10, - "min_threshold_value": 0.0, - "max_threshold_value": 1.0, - }, - "max_f1_score": 1.0, - "max_precision": 1.0, - "max_recall": 1.0, - "optimal_threshold_value": 0, - } - binary_result = BinaryClfEvaluationResult( - np.linspace(0, 1, num=10), - metric="dec_values", - timestamp="19700101_000000", - name="svc_rules", - ) - - assert binary_result.create_info_dict() == expected diff --git a/amides/tests/unit/test_events.py b/amides/tests/unit/test_events.py index 5768c32..085225c 100644 --- a/amides/tests/unit/test_events.py +++ b/amides/tests/unit/test_events.py @@ -37,9 +37,9 @@ def test_init(self): assert Events("proxy_web", EventType.PROXY_WEB) event_paths = [ - (pc_events_json_path, 20), - (pc_events_jsonl_path, 20), - (powershell_events_jsonl_path, 30), + (pc_events_json_path(), 20), + (pc_events_jsonl_path(), 20), + (powershell_events_jsonl_path(), 30), ] @pytest.mark.parametrize("events_path,num_events", event_paths) @@ -61,12 +61,6 @@ def test_load_from_dir_invalid_events_path(self): with pytest.raises(FileNotFoundError): events.load_from_dir("some/sample/path") - def test_load_from_dir_event_type_mismatch(self): - events = Events(EventType.PROCESS_CREATION) - events.load_from_dir(powershell_events_jsonl_path()) - - assert not events.data - def test_add_event(self): event = {"winlog": {"event_id": 1}} events = Events(EventType.PROCESS_CREATION) @@ -79,7 +73,7 @@ def test_add_event(self): @pytest.mark.parametrize("split_sizes", split_sizes) def test_create_random_split_raising_typerrror(self, split_sizes): events = Events(EventType.PROCESS_CREATION) - events.load_from_dir(pc_events_jsonl_path) + events.load_from_dir(pc_events_jsonl_path()) with pytest.raises(TypeError): _ = events.create_random_split(split_sizes=split_sizes) @@ -149,7 +143,10 @@ def 
test_add_events(self): events_cache.add_events(registry_events) events_cache.add_events(proxy_events) - assert "test" in events_cache.events + assert "process_creation" in events_cache.events + assert "powershell" in events_cache.events + assert "registry" in events_cache.events + assert "proxy_web" in events_cache.events def test_add_events_existing_name(self): events_1 = Events(EventType.PROCESS_CREATION, name="test") diff --git a/amides/tests/unit/test_extraction.py b/amides/tests/unit/test_extraction.py index 88395c5..b6dbe92 100644 --- a/amides/tests/unit/test_extraction.py +++ b/amides/tests/unit/test_extraction.py @@ -14,10 +14,8 @@ Split, WhitespaceAsterisk, WhitespaceAsteriskSlashMinus, - AnyWordCharacter, ) -from amides.features.preprocessing import FilterDummyCharacters, Lowercase from amides.features.filter import NumericValues, Strings diff --git a/amides/tests/unit/test_models_selection.py b/amides/tests/unit/test_models_selection.py index 28970cf..2dfb3fb 100644 --- a/amides/tests/unit/test_models_selection.py +++ b/amides/tests/unit/test_models_selection.py @@ -10,91 +10,86 @@ from amides.models.selection import HyperParameterOptimizer -@pytest.fixture def train_data(): train_data = [ "some-commandline", "some-other-commandline", "this-commandline", - "that-commandline" + "that-commandline", ] c_vect = CountVectorizer() train_data_transformed = c_vect.fit_transform(train_data).toarray() - return DataBunch(np.array(train_data_transformed), np.array([0, 1, 1, 0]), - ["benign", "matching"]) + return DataBunch( + np.array(train_data_transformed), np.array([0, 1, 1, 0]), ["benign", "matching"] + ) -@pytest.fixture -def train_test_split(train_data): + +def train_test_split(): test_data = [ "more-commandline", "even-more-commandline", "wow-commandline", - "such-commandline" + "such-commandline", ] train_data = [ "some-commandline", "some-other-commandline", "this-commandline", - "that-commandline" + "that-commandline", ] c_vect = CountVectorizer() train_data_transformed = c_vect.fit_transform(train_data).toarray() test_data_transformed = c_vect.transform(test_data).toarray() - test_data = DataBunch(test_data_transformed, np.array([1, 0, 1, 0]), - ["benign", "matching"]) - train_data = DataBunch(train_data_transformed, np.array([0, 1, 1, 0]), - ["benign", "matching"]) + test_data = DataBunch( + test_data_transformed, np.array([1, 0, 1, 0]), ["benign", "matching"] + ) + train_data = DataBunch( + train_data_transformed, np.array([0, 1, 1, 0]), ["benign", "matching"] + ) return TrainTestSplit(train_data, test_data, name="some_rule") -class TestHParamOptimizer: +class TestHParamOptimizer: @pytest.fixture def hp_optimizer(self): return HyperParameterOptimizer( estimator=SVC(), param_grid={ "C": np.logspace(-1, 1, num=1), - "gamma": np.logspace(1, 5, num=1) + "gamma": np.logspace(1, 5, num=1), }, search_method=GridSearchCV, - cv_schema=2) + cv_schema=2, + ) def test_init(self): optimizer = HyperParameterOptimizer( estimator=SVC(), param_grid={ "C": np.logspace(-1, 1, num=1), - "gamma": np.logspace(1, 5, num=1) + "gamma": np.logspace(1, 5, num=1), }, search_method=GridSearchCV, - cv_schema=2) + cv_schema=2, + ) assert optimizer - def test_init_invalid_param_grid(self): - with pytest.raises((ValueError, AttributeError)): - _ = HyperParameterOptimizer( - estimator=SVC(), - param_grid="invalid", - search_method=GridSearchCV, - cv_schema=2 - ) - def test_init_invalid_search_method(self): with pytest.raises(TypeError): _ = HyperParameterOptimizer( estimator=SVC(), param_grid={ "C": 
np.logspace(-1, 1, num=1), - "gamma": np.logspace(1, 5, num=1) + "gamma": np.logspace(1, 5, num=1), }, search_method="invalid", - cv_schema=2 + cv_schema=2, ) - def test_best_parameters_not_fitted(self, hp_optimizer): + def test_best_parameters_not_fitted(self, hp_optimizer): with pytest.raises(NotFittedError): _ = hp_optimizer.best_parameters @@ -106,44 +101,40 @@ def test_best_score_not_fitted(self, hp_optimizer): with pytest.raises(NotFittedError): _ = hp_optimizer.best_parameters - def test_search_best_parameters(self, train_data): + def test_search_best_parameters(self): optimizer = HyperParameterOptimizer( estimator=SVC(), param_grid={ "C": np.logspace(-1, 1, num=1), - "gamma": np.logspace(1, 5, num=1) + "gamma": np.logspace(1, 5, num=1), }, search_method=GridSearchCV, - cv_schema=2 + cv_schema=2, ) - optimizer.search_best_parameters(train_data) + optimizer.search_best_parameters(train_data()) assert optimizer.best_estimator assert optimizer.best_parameters assert optimizer.best_score - def test_search_best_parameters_invalid_cv_schema(self, train_data): + def test_search_best_parameters_invalid_cv_schema(self): optimizer = HyperParameterOptimizer( - estimator=SVC(), - param_grid={ - "C": np.logspace(-1, 1, num=1), - "gamma": np.logspace(1, 5, num=1) - }, - search_method=GridSearchCV, - cv_schema="invalid" - ) + estimator=SVC(), + param_grid={ + "C": np.logspace(-1, 1, num=1), + "gamma": np.logspace(1, 5, num=1), + }, + search_method=GridSearchCV, + cv_schema="invalid", + ) with pytest.raises(ValueError): - optimizer.search_best_parameters(train_data) + optimizer.search_best_parameters(train_data()) def test_search_best_parameters_invalid_input(self, hp_optimizer): with pytest.raises(ValueError): hp_optimizer.search_best_parameters("invalid") - def test_search_and_predict(self, hp_optimizer, train_test_split): - predict = hp_optimizer.search_and_predict(train_test_split) + def test_search_and_predict(self, hp_optimizer): + predict = hp_optimizer.search_and_predict(train_test_split()) assert len(predict) == 4 - - def test_search_and_predict_invalid_input(self, hp_optimizer): - with pytest.raises(ValueError): - _ = hp_optimizer.search_and_predict(train_test_split) diff --git a/amides/tests/unit/test_persist.py b/amides/tests/unit/test_persist.py index 333fe37..24cd646 100644 --- a/amides/tests/unit/test_persist.py +++ b/amides/tests/unit/test_persist.py @@ -17,7 +17,6 @@ TrainingResult, ValidationResult, ) -from amides.models.baseline.baseline import BaselineClassifier @pytest.fixture @@ -60,15 +59,15 @@ def ttv_split(self): @pytest.fixture def train_result(self, tt_split): return TrainingResult( - SVC(), tt_split, name="some_train_result", timestamp="19700101_000000" + SVC(), data=tt_split, name="some_train_result", timestamp="19700101_000000" ) @pytest.fixture def valid_result(self, tt_split): return ValidationResult( SVC(), - tt_split, - np.array([1, 0, 0]), + data=tt_split, + predict=np.array([1, 0, 0]), name="some_valid_result", timestamp="19700101_000000", ) @@ -79,8 +78,8 @@ def multi_result(self, tt_split): multi_rslt.add_result( ValidationResult( SVC(), - tt_split, - np.array([1, 0, 0]), + data=tt_split, + predict=np.array([1, 0, 0]), name="some_rule", timestamp="19700101_000000", ) @@ -88,8 +87,8 @@ def multi_result(self, tt_split): multi_rslt.add_result( ValidationResult( SVC(), - tt_split, - np.array([1, 0, 0]), + data=tt_split, + predict=np.array([1, 0, 0]), name="another_rule", timestamp="19700101_000000", ) @@ -97,20 +96,6 @@ def multi_result(self, tt_split): return 
multi_rslt - @pytest.fixture - def baseline_clf(self): - return BaselineClassifier( - { - "remove_escape_characters": True, - "delete_whitespaces": True, - "modify_exe": True, - "swap_slash_minus": True, - "swap_minus_slash": True, - }, - name="some_base_clf", - timestamp="19700101_000000", - ) - def test_init(self, dump_dir): dumper = Dumper(dump_dir) assert dumper @@ -165,19 +150,8 @@ def test_save_validation_result(self, dump_dir, valid_result): assert entries[0] == f"{expected_filename}.zip" assert entries[1] == f"{expected_filename}_info.json" - def test_save_calibration_result(self, dump_dir, calib_result): - expected_filename = "calib_rslt_some_calib_result_19700101_000000" - - dumper = Dumper(dump_dir) - dumper.save_object(calib_result) - - entries = sorted(os.listdir(dump_dir)) - assert len(entries) == 2 - assert entries[0] == f"{expected_filename}.zip" - assert entries[1] == f"{expected_filename}_info.json" - def test_save_multi_result(self, dump_dir, multi_result): - expected_filename = "multi_rslt_some_rules_19700101_000000" + expected_filename = "multi_train_rslt_some_rules_19700101_000000" dumper = Dumper(dump_dir) dumper.save_object(multi_result) @@ -187,17 +161,6 @@ def test_save_multi_result(self, dump_dir, multi_result): assert entries[0] == f"{expected_filename}.zip" assert entries[1] == f"{expected_filename}_info.json" - def test_save_baseline_clf(self, dump_dir, baseline_clf): - expected_filename = "baseline_clf_some_base_clf_19700101_000000" - - dumper = Dumper(dump_dir) - dumper.save_object(baseline_clf) - - entries = sorted(os.listdir(dump_dir)) - assert len(entries) == 2 - assert entries[0] == f"{expected_filename}.zip" - assert entries[1] == f"{expected_filename}_info.json" - def test_save_no_output_path(self, tmpdir, tt_split): with tmpdir.as_cwd() as cwd: dumper = Dumper() @@ -283,27 +246,6 @@ def test_load_validation_result(self, dump_dir, valid_result): result.data.test_data.labels, valid_result.data.test_data.labels ) - def test_load_calibration_result(self, dump_dir, calib_result): - dumper = Dumper(dump_dir) - dumper.save_object(calib_result) - - entries = sorted(os.listdir(dump_dir)) - - result = dumper.load_object(entries[0]) - assert type(result) is CalibrationResult - assert np.array_equal( - result.data.train_data.samples, calib_result.data.train_data.samples - ) - assert np.array_equal( - result.data.train_data.labels, calib_result.data.train_data.labels - ) - assert np.array_equal( - result.data.test_data.samples, calib_result.data.test_data.samples - ) - assert np.array_equal( - result.data.test_data.labels, calib_result.data.test_data.labels - ) - def test_load_multi_result(self, dump_dir, multi_result): dumper = Dumper(dump_dir) dumper.save_object(multi_result) @@ -345,16 +287,6 @@ def test_load_multi_result(self, dump_dir, multi_result): result_2.data.test_data.labels, expected_result_2.data.test_data.labels ) - def test_load_baseline_clf(self, dump_dir, baseline_clf): - dumper = Dumper(dump_dir) - dumper.save_object(baseline_clf) - - entries = sorted(os.listdir(dump_dir)) - - result = dumper.load_object(entries[0]) - assert type(result) is BaselineClassifier - assert result.modifier_mask == 0 - def test_load_object_from_dir_path(self, dump_dir): dumper = Dumper() @@ -464,7 +396,7 @@ def test_write_non_existing_file(self, tmp_path): writer = EventWriter( str(test_out), "2023-06-01T00:00:00", "2023-06-01T01:00:00" ) - writer.write(events, 0) + writer.write(events) assert test_file.is_file() with test_file.open("r", encoding="utf-8") as test_file: 
@@ -492,7 +424,7 @@ def test_write_add_to_existing_file(self, tmp_path): writer = EventWriter( str(test_out), "2023-06-01T00:00:00", "2023-06-01T01:00:00" ) - writer.write(events[1:], 0) + writer.write(events[1:]) with test_file.open("r", encoding="utf-8") as in_file: result = in_file.read() @@ -519,7 +451,7 @@ def test_read_last_file(self, tmp_path): ] expected = set([f"{json.dumps(event)}\n" for event in events]) - writer.write(events, 0) + writer.write(events) result = writer.read_last_file() assert expected == result @@ -538,7 +470,7 @@ def test_write_non_existing_file(self, tmp_path): writer = EventCompressor( str(test_out), "2023-06-01T00:00:00", "2023-06-01T01:00:00" ) - writer.write(events, 0) + writer.write(events) assert test_archive.is_file() with ZipFile(str(test_archive), mode="r") as my_zip: @@ -578,7 +510,7 @@ def test_write_add_to_existing_file(self, tmp_path): writer = EventCompressor( str(test_out), "2023-06-01T00:00:00", "2023-06-01T01:00:00" ) - writer.write(events[1:], 0) + writer.write(events[1:]) with ZipFile(str(test_archive), mode="r") as my_zip: with TextIOWrapper( @@ -608,6 +540,6 @@ def test_read_last_file(self, tmp_path): ] expected = set([f"{json.dumps(event)}\n" for event in events]) - writer.write(events, 0) + writer.write(events) result = writer.read_last_file() assert expected == result diff --git a/amides/tests/unit/test_result.py b/amides/tests/unit/test_result.py index 176c405..81fda59 100644 --- a/amides/tests/unit/test_result.py +++ b/amides/tests/unit/test_result.py @@ -10,8 +10,8 @@ class TestTrainingResult: def test_default_name(self): train_rslt = TrainingResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( @@ -24,8 +24,8 @@ def test_default_name(self): def test_file_name_default_name(self): train_rslt = TrainingResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( @@ -39,8 +39,8 @@ def test_file_name_default_name(self): def test_file_name_no_default_name(self): train_rslt = TrainingResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( @@ -85,16 +85,19 @@ def test_create_info_dict(self): "positive_negative_ratio": 0.5, }, }, - "feature_extractors": [], "name": "ttv_split", }, + "feature_extractors": None, "name": "sample_result", "timestamp": "20220518_111030", + "scaler": None, + "tainted_seed": 0, + "tainted_share": 0.0, } train_rslt = TrainingResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( @@ -111,30 +114,30 @@ def test_create_info_dict(self): class TestValidationResult: def test_default_name(self): valid_rslt = ValidationResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( np.array(["other", "validation", 
"data"]), np.array([0, 1, 0]) ), ), - np.array([1, 1, 0]), + predict=np.array([1, 1, 0]), ) assert valid_rslt.name == "valid_rslt_svc" def test_file_name_default_name(self): valid_rslt = ValidationResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( np.array(["other", "validation", "data"]), np.array([0, 1, 0]) ), ), - np.array([1, 1, 0]), + predict=np.array([1, 1, 0]), timestamp="20220518_111030", ) @@ -142,15 +145,15 @@ def test_file_name_default_name(self): def test_file_name_no_default_name(self): valid_rslt = ValidationResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( np.array(["other", "validation", "data"]), np.array([0, 1, 0]) ), ), - np.array([1, 1, 0]), + predict=np.array([1, 1, 0]), name="sample_result", timestamp="20220518_111030", ) @@ -189,23 +192,26 @@ def test_create_info_dict(self): "positive_negative_ratio": 0.5, }, }, - "feature_extractors": [], "name": "ttv_split", }, + "feature_extractors": None, "name": "sample_result", "timestamp": "20220518_111030", + "scaler": None, + "tainted_seed": 0, + "tainted_share": 0.0, } valid_rslt = ValidationResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( np.array(["other", "validation", "data"]), np.array([0, 1, 0]) ), ), - np.array([1, 1, 0]), + predict=np.array([1, 1, 0]), name="sample_result", timestamp="20220518_111030", ) @@ -232,15 +238,15 @@ def test_file_name_custom_name(self): def test_add_result(self): multi_rslt = MultiTrainingResult(name="custom", timestamp="20220518_120000") valid_rslt = ValidationResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( np.array(["other", "validation", "data"]), np.array([0, 1, 0]) ), ), - np.array([1, 1, 0]), + predict=np.array([1, 1, 0]), name="sample_result", timestamp="20220518_111030", ) @@ -285,26 +291,29 @@ def test_create_info_dict(self): "positive_negative_ratio": 0.5, }, }, - "feature_extractors": [], "name": "ttv_split", }, "name": "sample_result", "timestamp": "20220518_111030", + "scaler": None, + "tainted_seed": 0, + "tainted_share": 0.0, + "feature_extractors": None, } }, } multi_rslt = MultiTrainingResult(name="custom", timestamp="20220518_120000") valid_rslt = ValidationResult( - SVC(), - TrainTestValidSplit( + estimator=SVC(), + data=TrainTestValidSplit( DataBunch(np.array(["other", "training", "data"]), np.array([0, 1, 0])), DataBunch(np.array(["other", "testing", "data"]), np.array([0, 1, 0])), DataBunch( np.array(["other", "validation", "data"]), np.array([0, 1, 0]) ), ), - np.array([1, 1, 0]), + predict=np.array([1, 1, 0]), name="sample_result", timestamp="20220518_111030", ) diff --git a/amides/tests/unit/test_sigma.py b/amides/tests/unit/test_sigma.py index cb40e41..adf2660 100644 --- a/amides/tests/unit/test_sigma.py +++ b/amides/tests/unit/test_sigma.py @@ -5,10 +5,10 @@ from amides.sigma import ( 
extract_field_values_from_filter, - RuleDatasetError, RuleDataset, RuleSetDataset, RuleSetDatasetError, + RuleDatasetError, ) from amides.events import Events, EventType @@ -18,7 +18,7 @@ def data_path(): def sigma_path(): - return os.path.join(data_path, "sigma-study") + return os.path.join(data_path(), "sigma-study") def benign_pc_events(): @@ -26,39 +26,41 @@ def benign_pc_events(): data_path(), "socbed-sample/process_creation/jsonl" ) events = Events(EventType.PROCESS_CREATION, name="process_creation") + events.load_from_dir(benign_pc_events_path) - return events.load_from_dir(benign_pc_events_path) + return events def benign_powershell_events(): powershell_events_path = os.path.join(data_path(), "socbed-sample/powershell/jsonl") events = Events(EventType.POWERSHELL, name="powershell") + events.load_from_dir(powershell_events_path) - return events.load_from_dir(powershell_events_path) + return events def pc_events_path(): - return os.path.join(sigma_path, "events/windows/process_creation") + return os.path.join(sigma_path(), "events/windows/process_creation") def pc_rules_path(): - return os.path.join(sigma_path, "rules/windows/process_creation") + return os.path.join(sigma_path(), "rules/windows/process_creation") def powershell_events_path(): - return os.path.join(sigma_path, "events/windows/powershell") + return os.path.join(sigma_path(), "events/windows/powershell") def powershell_rules_path(): - return os.path.join(sigma_path, "rules/windows/powershell") + return os.path.join(sigma_path(), "rules/windows/powershell") def proxy_events_path(): - return os.path.join(sigma_path, "events/proxy") + return os.path.join(sigma_path(), "events/proxyweb") def proxy_rules_path(): - return os.path.join(sigma_path, "rules/proxy") + return os.path.join(sigma_path(), "rules/proxyweb") class TestMultiFieldVisitor: @@ -373,8 +375,8 @@ def test_extract_web_proxy_field_values_from_filter(self, rule_filter, expected) class TestRuleDataset: rules_data = [ ( - os.path.join(sigma_path, "rules/windows/process_creation/rule_1.yml"), - os.path.join(sigma_path, "events/windows/process_creation/rule_1"), + os.path.join(sigma_path(), "rules/windows/process_creation/rule_1.yml"), + os.path.join(sigma_path(), "events/windows/process_creation/rule_1"), "rule_1", ( "process.command_line: (" @@ -384,8 +386,8 @@ class TestRuleDataset: ), ), ( - os.path.join(sigma_path, "rules/windows/powershell/rule_1.yml"), - os.path.join(sigma_path, "events/windows/powershell/rule_1"), + os.path.join(sigma_path(), "rules/windows/powershell/rule_1.yml"), + os.path.join(sigma_path(), "events/windows/powershell/rule_1"), "rule_1", ( 'Keyless: "del (Get-PSReadlineOption).HistorySavePath" OR Keyless: "Set-PSReadlineOption ' @@ -394,8 +396,8 @@ class TestRuleDataset: ), ), ( - os.path.join(sigma_path, "rules/windows/registry/rule_1.yml"), - os.path.join(sigma_path, "events/windows/registry/rule_1"), + os.path.join(sigma_path(), "rules/windows/registry/rule_1.yml"), + os.path.join(sigma_path(), "events/windows/registry/rule_1"), "rule_1", ( 'winlog.event_data.TargetObject: ("*\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run\\\\*" ' @@ -406,10 +408,10 @@ class TestRuleDataset: ), ), ( - os.path.join(sigma_path, "rules/proxy/rule_1.yml"), - os.path.join(sigma_path, "events/proxy/rule_1"), + os.path.join(sigma_path(), "rules/proxyweb/rule_1.yml"), + os.path.join(sigma_path(), "events/proxyweb/rule_1"), "rule_1", - 'c-uri: "*/list/suc?name=*"', + 'url.full: "*/list/suc?name=*"', ), ] @@ -424,23 +426,35 @@ def test_load_events_and_filter( 
assert rule_data.evasions.size > 0 @pytest.mark.parametrize( - "matches_evasions_path", + "rule_path,matches_evasions_path", [ - "events/windows/process_creation/missing_evasions", - "events/windows/process_creation/missing_matches", + ( + os.path.join(sigma_path(), "rules/windows/process_creation/rule_1.yml"), + os.path.join( + sigma_path(), "events/windows/process_creation/missing_evasions" + ), + ), + ( + os.path.join(sigma_path(), "rules/windows/process_creation/rule_1.yml"), + os.path.join( + sigma_path(), "events/windows/process_creation/missing_matches" + ), + ), ], ) def test_load_events_and_filter_missing_matches_or_evasions( self, rule_path, matches_evasions_path ): - missing_events = os.path.join(sigma_path, matches_evasions_path) rule_data = RuleDataset() - rule_data.load_events_and_filter(missing_events, rule_path) + rule_data.load_events_and_filter(matches_evasions_path, rule_path) - def test_load_events_and_filter_missing_properties(self, rule_path): + def test_load_events_and_filter_missing_properties(self): missing_properties = os.path.join( - sigma_path, "events/windows/process_creation/missing_properties" + sigma_path(), "events/windows/process_creation/missing_properties" + ) + rule_path = os.path.join( + sigma_path(), "rules/windows/process_creation/rule_1.yml" ) rule_data = RuleDataset() @@ -449,8 +463,8 @@ def test_load_events_and_filter_missing_properties(self, rule_path): search_fields_values = [ ( - os.path.join(sigma_path, "events/windows/process_creation/rule_1"), - os.path.join(sigma_path, "rules/windows/process_creation/rule_1.yml"), + os.path.join(sigma_path(), "events/windows/process_creation/rule_1"), + os.path.join(sigma_path(), "rules/windows/process_creation/rule_1.yml"), ["process.command_line"], [ 'reg query \\"HKEY_CURRENT_USER\\Software\\Microsoft\\Terminal Server Client\\Default\\"', @@ -459,8 +473,8 @@ def test_load_events_and_filter_missing_properties(self, rule_path): ], ), ( - os.path.join(sigma_path, "events/windows/powershell/rule_1"), - os.path.join(sigma_path, "rules/windows/powershell/rule_1.yml"), + os.path.join(sigma_path(), "events/windows/powershell/rule_1"), + os.path.join(sigma_path(), "rules/windows/powershell/rule_1.yml"), ["Keyless"], [ "del (Get-PSReadlineOption).HistorySavePath", @@ -470,8 +484,8 @@ def test_load_events_and_filter_missing_properties(self, rule_path): ], ), ( - os.path.join(sigma_path, "events/windows/registry/rule_1"), - os.path.join(sigma_path, "rules/windows/registry/rule_1.yml"), + os.path.join(sigma_path(), "events/windows/registry/rule_1"), + os.path.join(sigma_path(), "rules/windows/registry/rule_1.yml"), ["winlog.event_data.Details", "winlog.event_data.TargetObject"], [ "*\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run\\\\*", @@ -488,8 +502,8 @@ def test_load_events_and_filter_missing_properties(self, rule_path): ], ), ( - os.path.join(sigma_path, "events/proxy/rule_1"), - os.path.join(sigma_path, "rules/proxy/rule_1.yml"), + os.path.join(sigma_path(), "events/proxyweb/rule_1"), + os.path.join(sigma_path(), "rules/proxyweb/rule_1.yml"), ["url.full"], ["*/list/suc?name=*"], ), @@ -512,21 +526,19 @@ def test_extract_fields_from_filter( events_paths = [ ( benign_pc_events(), - os.path.join(sigma_path, "events/windows/process_creation/rule_1"), - os.path.join(sigma_path, "rules/windows/process_creation/rule_1.yml"), - ), - ( - benign_powershell_events(), - os.path.join(sigma_path, "events/windows/powershell/rule_1"), - os.path.join(sigma_path, "rules/windows/powershell/rule_1.yml"), + 
os.path.join(sigma_path(), "events/windows/process_creation/rule_1"), + os.path.join(sigma_path(), "rules/windows/process_creation/rule_1.yml"), ), ] + @pytest.mark.parametrize( + "benign_events,matches_evasions_path,rule_path", events_paths + ) def test_create_matches_evasions_train_test_split( self, benign_events, - rule_path, matches_evasions_path, + rule_path, ): expected_train_size = expected_test_size = 23 rule_data = RuleDataset() @@ -544,59 +556,46 @@ def test_create_matches_evasions_train_test_split( events_search_fields = [ ( - os.path.join(sigma_path, "events/windows/process_creation/rule_1"), - os.path.join(sigma_path, "rules/windows/process_creation/rule_1.yml"), + benign_pc_events(), + os.path.join(sigma_path(), "events/windows/process_creation/rule_1"), + os.path.join(sigma_path(), "rules/windows/process_creation/rule_1.yml"), ["process.command_line"], ), ( - os.path.join(sigma_path, "events/windows/powershell/rule_1"), - os.path.join(sigma_path, "rules/windows/powershell/rule_1.yml"), + benign_powershell_events(), + os.path.join(sigma_path(), "events/windows/powershell/rule_1"), + os.path.join(sigma_path(), "rules/windows/powershell/rule_1.yml"), ["Keyless"], ), - ( - os.path.join(sigma_path, "events/windows/registry/rule_1"), - os.path.join(sigma_path, "rules/windows/registry/rule_1.yml"), - ["winlog.event_data.Details", "winlog.event_data.TargetObject"], - ), - ( - os.path.join(sigma_path, "events/proxy/rule_1"), - os.path.join(sigma_path, "rules/proxy/rule_1.yml"), - ["url.full"], - ), ] @pytest.mark.parametrize( - "benign_events, matches_evasions_path,rule_path,search_fields", - search_fields_values, + "benign_events,matches_evasions_path,rule_path,search_fields", + events_search_fields, ) def test_create_filter_evasions_train_test_split( - self, benign_events, rule_path, evasions_path, search_fields + self, benign_events, matches_evasions_path, rule_path, search_fields ): - expected_train_size = expected_test_size = 23 - rule_data = RuleDataset() - rule_data.load_events_and_filter(evasions_path, rule_path) + rule_data.load_events_and_filter(matches_evasions_path, rule_path) train_test_split = rule_data.create_filter_evasions_train_test_split( benign_train_events=benign_events, benign_test_events=benign_events, search_fields=search_fields, ) - train_data = train_test_split.train_data - assert train_data.size == expected_train_size - - test_data = train_test_split.test_data - assert test_data.size == expected_test_size + assert train_test_split.train_data + assert train_test_split.test_data @pytest.mark.parametrize( - "benign_events, matches_evasions_path,rule_path,search_fields", - search_fields_values, + "benign_events,matches_evasions_path,rule_path,search_fields", + events_search_fields, ) def test_create_filter_evasions_train_test_split_with_seed( - self, benign_events, evasions_path, rule_path, search_fields + self, benign_events, matches_evasions_path, rule_path, search_fields ): rule_data = RuleDataset() - rule_data.load_events_and_filter(evasions_path, rule_path) + rule_data.load_events_and_filter(matches_evasions_path, rule_path) tt_split = rule_data.create_filter_evasions_train_test_split( benign_train_events=benign_events, @@ -622,22 +621,23 @@ def test_create_filter_evasions_train_test_split_with_seed( tt_split.test_data.labels, other_tt_split.test_data.labels ) - def test_create_matches_evasions_validation_split( - self, - benign_events, - rule_path, - matches_evasions_path, - ): + def test_create_matches_evasions_validation_split(self): expected_train_size 
= 23 expected_test_size = 21 expected_valid_size = 22 + events_path = os.path.join( + sigma_path(), "events/windows/process_creation/rule_1" + ) + rule_path = os.path.join( + sigma_path(), "rules/windows/process_creation/rule_1.yml" + ) rule_data = RuleDataset() - rule_data.load_events_and_filter(matches_evasions_path, rule_path) + rule_data.load_events_and_filter(events_path, rule_path) valid_split = rule_data.create_matches_evasions_validation_split( - benign_train_events=benign_events, - benign_test_events=benign_events, - benign_valid_events=benign_events, + benign_train_events=benign_pc_events(), + benign_test_events=benign_pc_events(), + benign_valid_events=benign_pc_events(), evasions_test_size=0.33, evasions_valid_size=0.66, evasions_split_seed=42, @@ -652,19 +652,23 @@ def test_create_matches_evasions_validation_split( valid_data = valid_split.validation_data assert valid_data.size == expected_valid_size - def test_create_filter_evasions_validaion_split( - self, benign_events, rule_path, matches_evasions_path - ): + def test_create_filter_evasions_validaion_split(self): expected_train_size = 23 expected_test_size = 21 expected_valid_size = 22 + events_path = os.path.join( + sigma_path(), "events/windows/process_creation/rule_1" + ) + rule_path = os.path.join( + sigma_path(), "rules/windows/process_creation/rule_1.yml" + ) rule_data = RuleDataset() - rule_data.load_events_and_filter(matches_evasions_path, rule_path) + rule_data.load_events_and_filter(events_path, rule_path) valid_split = rule_data.create_filter_evasions_validation_split( - benign_train_events=benign_events, - benign_test_events=benign_events, - benign_valid_events=benign_events, + benign_train_events=benign_pc_events(), + benign_test_events=benign_pc_events(), + benign_valid_events=benign_pc_events(), search_fields=["process.command_line"], evasions_test_size=0.33, evasions_valid_size=0.66, @@ -691,28 +695,31 @@ def empty_data_dir(self, tmpdir): return data evasions_rule_paths = [ - (pc_events_path(), pc_rules_path()), (powershell_events_path(), powershell_rules_path()), (proxy_events_path(), proxy_rules_path()), ] @pytest.mark.parametrize("evasions_path,rules_path", evasions_rule_paths) def test_load_rule_set_data(self, evasions_path, rules_path): - rule_set_data = RuleSetDataset(evasions_path, rules_path) + rule_set_data = RuleSetDataset() + rule_set_data.load_rule_set_data(evasions_path, rules_path) assert rule_set_data.get_rule_dataset_by_name("rule_1") def test_load_rule_set_data_invalid_rules_path(self): with pytest.raises(RuleSetDatasetError): - _ = RuleSetDataset(pc_events_path(), "/non/existing/path") + rule_set_data = RuleSetDataset() + rule_set_data.load_rule_set_data(pc_events_path(), "/non/existing/path") def test_load_rule_set_data_invalid_events_path(self): with pytest.raises(RuleSetDatasetError): - _ = RuleSetDataset("/non/existing/path", pc_rules_path()) + rule_set_data = RuleSetDataset() + rule_set_data.load_rule_set_data("/non/existing/path", pc_rules_path()) def test_create_matches_evasions_train_test_split(self): expected_train_size = expected_test_size = 26 benign_events = benign_pc_events() - rule_set_data = RuleSetDataset(pc_events_path(), pc_rules_path()) + rule_set_data = RuleSetDataset() + rule_set_data.load_rule_set_data(pc_events_path(), pc_rules_path()) tt_split = rule_set_data.create_matches_evasions_train_test_split( benign_train_events=benign_events, benign_test_events=benign_events ) @@ -725,7 +732,8 @@ def test_create_matches_evasions_valid_split(self): expected_test_size = 
expected_valid_size = 23 benign_events = benign_pc_events() - rule_set_data = RuleSetDataset(pc_events_path(), pc_rules_path()) + rule_set_data = RuleSetDataset() + rule_set_data.load_rule_set_data(pc_events_path(), pc_rules_path()) valid_split = rule_set_data.create_matches_evasions_validation_split( benign_train_events=benign_events, benign_test_events=benign_events, @@ -766,16 +774,16 @@ def test_create_matches_evasions_valid_split(self): ), ] + @pytest.mark.parametrize( + "matches_evasions_path,rules_path,search_fields,field_values", rules_data + ) def test_extract_field_values_from_filter( - self, evasions_path, rules_path, search_fields, field_values + self, matches_evasions_path, rules_path, search_fields, field_values ): - rule_set_data = RuleSetDataset(evasions_path, rules_path) + rule_set_data = RuleSetDataset() + rule_set_data.load_rule_set_data(matches_evasions_path, rules_path) assert ( - set( - rule_set_data.extract_field_values_from_filter( - search_fields=search_fields - ) - ) + rule_set_data.extract_field_values_from_filter(search_fields=search_fields) == field_values ) @@ -785,7 +793,9 @@ def test_create_filter_evasions_train_test_split( expected_train_size = expected_test_size = 26 benign_events = benign_pc_events() - rule_set_data = RuleSetDataset(pc_events_path(), pc_rules_path()) + rule_set_data = RuleSetDataset() + rule_set_data.load_rule_set_data(pc_events_path(), pc_rules_path()) + tt_split = rule_set_data.create_filter_evasions_train_test_split( benign_train_events=benign_events, benign_test_events=benign_events, @@ -797,7 +807,9 @@ def test_create_filter_evasions_train_test_split( def test_create_filter_evasions_valid_split_with_seed(self): benign_events = benign_pc_events() - rule_set = RuleSetDataset(pc_events_path(), pc_rules_path()) + rule_set = RuleSetDataset() + rule_set.load_rule_set_data(pc_events_path(), pc_rules_path()) + valid_split = rule_set.create_filter_evasions_validation_split( benign_train_events=benign_events, benign_test_events=benign_events, diff --git a/amides/tox.ini b/amides/tox.ini new file mode 100644 index 0000000..42da6d9 --- /dev/null +++ b/amides/tox.ini @@ -0,0 +1,13 @@ +[tox] +envlist = py{310,311} +minversion = 3.10 + +[testenv] +deps = -rrequirements_dev.txt + +[testenv:py{310,311}-tests] +description = Run unit tests +usedevelop = True +deps = {[testenv]deps} +commands = + pytest -vv tests/ {posargs} diff --git a/build_image.sh b/build_image.sh index b2779fd..b9e3391 100755 --- a/build_image.sh +++ b/build_image.sh @@ -2,6 +2,11 @@ AMIDES_TAG="amides:base" -echo "Building image '$AMIDES_TAG'..." +echo "########## Building AMIDES Docker image '$AMIDES_TAG'... ##########" docker build --tag $AMIDES_TAG . +if [ $? -eq 0 ]; then + echo "########## Successfully built AMIDES Docker image '$AMIDES_TAG' ##########" +else + echo "########## Failed to build AMIDES Docker image '$AMIDES_TAG' ##########" +fi diff --git a/cleanup.sh b/cleanup.sh index cf8f856..1338e4a 100755 --- a/cleanup.sh +++ b/cleanup.sh @@ -1,26 +1,19 @@ #!/bin/bash -AMIDES_IMAGE="amides:base" -AMIDES_RESULTS_CONTAINER="amides-results" -AMIDES_ENV_CONTAINER="amides-env" - -echo "Removing containers..." -./remove_containers.sh -echo "Removing image '$AMIDES_IMAGE'..." ./remove_image.sh -echo "Removing generated models..." -sudo rm -r ./amides/models/* +echo "########## Removing generated models... ##########" +rm -r ./amides/models if [ $? 
-eq 0 ]; then - echo "Successfully removed generated models" + echo "########## Successfully removed generated models ##########" else - echo "Failed to remove generated models" + echo "########## Failed to remove generated models ##########" fi -echo "Removing generated plots..." -sudo rm -r ./amides/plots/* +echo "########## Removing generated plots... ##########" +rm -r ./amides/plots if [ $? -eq 0 ]; then - echo "Successfully removed generated plots" + echo "########## Successfully removed generated plots ##########" else - echo "Failed to remove generated plots" + echo "########## Failed to remove generated plots ##########" fi diff --git a/create_containers.sh b/create_containers.sh deleted file mode 100755 index c491e79..0000000 --- a/create_containers.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/bash - -AMIDES_IMAGE="amides:base" -AMIDES_RESULTS_CONTAINER="amides-results" -AMIDES_ENV_CONTAINER="amides-env" - - -echo "Creating AMIDES results container '$AMIDES_RESULTS_CONTAINER'..." -docker create --name $AMIDES_RESULTS_CONTAINER --interactive --tty --mount type=bind,source="$(pwd)"/amides/models,target=/amides/models --mount type=bind,source="$(pwd)"/amides/plots,target=/amides/plots --mount type=bind,source="$(pwd)"/data,target=/data $AMIDES_IMAGE ./bin/results.sh -if [ $? -eq 0 ]; then - echo "Successfully created AMIDES results container '$AMIDES_RESULTS_CONTAINER'" -else - echo "Failed creating AMIDES results container '$AMIDES_RESULTS_CONTAINER'" -fi - -echo "Creating AMIDES environment container '$AMIDES_ENV_CONTAINER'..." -docker create --name $AMIDES_ENV_CONTAINER --interactive --tty --mount type=bind,source="$(pwd)"/amides/models,target=/amides/models --mount type=bind,source="$(pwd)"/amides/plots,target=/amides/plots --mount type=bind,source="$(pwd)"/data,target=/data $AMIDES_IMAGE /bin/bash -if [ $? -eq 0 ]; then - echo "Successfully created AMIDES environment container '$AMIDES_ENV_CONTAINER'" -else - echo "Failed creating AMIDES environment container '$AMIDES_ENV_CONTAINER'" -fi diff --git a/create_results.sh b/create_results.sh deleted file mode 100755 index c0f7641..0000000 --- a/create_results.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/bash - -AMIDES_IMAGE="amides:base" -AMIDES_RESULTS_CONTAINER="amides-results" - -echo "Starting AMIDES results container '$AMIDES_RESULTS_CONTAINER' ..." -docker start -i $AMIDES_RESULTS_CONTAINER -if [ $? -eq 0 ]; then - echo "Successfully executed AMIDES results container '$AMIDES_RESULTS_CONTAINER'" -else - echo "Failed to execute AMIDES results container '$AMIDES_RESULTS_CONTAINER'" -fi diff --git a/remove_containers.sh b/remove_containers.sh index 50b9249..9d8aa68 100755 --- a/remove_containers.sh +++ b/remove_containers.sh @@ -1,21 +1,21 @@ #!/bin/bash -AMIDES_RESULTS_CONTAINER="amides-results" +AMIDES_EXPERIMENTS_CONTAINER="amides-experiments" AMIDES_ENV_CONTAINER="amides-env" -echo "Removing AMIDES results container '$AMIDES_RESULTS_CONTAINER'..." -docker rm --force $AMIDES_RESULTS_CONTAINER +echo "########## Removing AMIDES results container '$AMIDES_EXPERIMENTS_CONTAINER'... ##########" +docker rm --force $AMIDES_EXPERIMENTS_CONTAINER if [ $? 
-eq 0 ]; then - echo "Successfully removed AMIDES results container '$AMIDES_RESULTS_CONTAINER'" + echo "########## Successfully removed AMIDES results container '$AMIDES_EXPERIMENTS_CONTAINER' ##########" else - echo "Failed to remove AMIDES results container '$AMIDES_RESULTS_CONTAINER'" + echo "########## Failed to remove AMIDES results container '$AMIDES_EXPERIMENTS_CONTAINER' ##########" fi -echo "Removing AMIDES env container '$AMIDES_ENV_CONTAINER'..." +echo "########## Removing AMIDES env container '$AMIDES_ENV_CONTAINER'... ##########" docker rm --force $AMIDES_ENV_CONTAINER if [ $? -eq 0 ]; then - echo "Successfully removed AMIDES env container '$AMIDES_ENV_CONTAINER'" + echo "########## Successfully removed AMIDES env container '$AMIDES_ENV_CONTAINER' ##########" else - echo "Failed to remove AMIDES env container '$AMIDES_ENV_CONTAINER'" + echo "########## Failed to remove AMIDES env container '$AMIDES_ENV_CONTAINER' ##########" fi diff --git a/remove_image.sh b/remove_image.sh index 795a289..d1fb530 100755 --- a/remove_image.sh +++ b/remove_image.sh @@ -2,10 +2,10 @@ AMIDES_IMAGE="amides:base" -echo "Removing AMIDES base image '$AMIDES_IMAGE'..." +echo "########## Removing AMIDES base image '$AMIDES_IMAGE'... ##########" docker image rm --force $AMIDES_IMAGE if [ $? -eq 0 ]; then - echo "Successfully removed AMIDES base image '$AMIDES_IMAGE'" + echo "########## Successfully removed AMIDES base image '$AMIDES_IMAGE' ##########" else - echo "Failed to remove AMIDES base image '$AMIDES_IMAGE'" + echo "########## Failed to remove AMIDES base image '$AMIDES_IMAGE' ##########" fi diff --git a/run_experiments.sh b/run_experiments.sh new file mode 100755 index 0000000..ce19002 --- /dev/null +++ b/run_experiments.sh @@ -0,0 +1,16 @@ +#!/usr/bin/bash + +AMIDES_IMAGE="amides:base" +AMIDES_EXPERIMENTS_CONTAINER="amides-experiments" +PWD=$(pwd) + +mkdir -p $PWD/amides/models +mkdir -p $PWD/amides/plots + +echo "########## Starting AMIDES experiments container '$AMIDES_EXPERIMENTS_CONTAINER' ... ##########" +docker run --rm --name $AMIDES_EXPERIMENTS_CONTAINER --interactive --tty --user docker-user --mount type=bind,source=$PWD/amides/models,target=/home/docker-user/amides/models --mount type=bind,source=$PWD/amides/plots,target=/home/docker-user/amides/plots --mount type=bind,source=$PWD/data,target=/home/docker-user/data $AMIDES_IMAGE ./experiments.sh +if [ $? -eq 0 ]; then + echo "########## Successfully executed AMIDES experiments container '$AMIDES_EXPERIMENTS_CONTAINER' ##########" +else + echo "########## Failed to execute AMIDES experiments container '$AMIDES_EXPERIMENTS_CONTAINER' ##########" +fi diff --git a/start_env.sh b/start_env.sh index 431da3b..910d743 100755 --- a/start_env.sh +++ b/start_env.sh @@ -2,11 +2,12 @@ AMIDES_IMAGE="amides:base" AMIDES_ENV_CONTAINER="amides-env" +PWD=$(pwd) -echo "Starting AMIDES environment container '$AMIDES_ENV_CONTAINER'..." -docker start -i $AMIDES_ENV_CONTAINER +echo "########## Starting AMIDES environment container '$AMIDES_ENV_CONTAINER'... ##########" +docker run --name $AMIDES_ENV_CONTAINER --interactive --rm --tty --user docker-user --mount type=bind,source=$PWD/amides/models,target=/home/docker-user/amides/models --mount type=bind,source=$PWD/amides/plots,target=/home/docker-user/amides/plots --mount type=bind,source=$PWD/data,target=/home/docker-user/data $AMIDES_IMAGE /bin/bash if [ $? 
-eq 0 ]; then - echo "Successfully executed AMIDES environment container '$AMIDES_ENV_CONTAINER'" + echo "########## Successfully executed AMIDES environment container '$AMIDES_ENV_CONTAINER' ##########" else - echo "Failed to execute AMIDES environment container '$AMIDES_ENV_CONTAINER'" + echo "########## Failed to execute AMIDES environment container '$AMIDES_ENV_CONTAINER' ##########" fi
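
For orientation, here is a minimal sketch of how the automation scripts touched by this patch chain together. It assumes the commands are run from the project root and that the required datasets are already present under ./data; those two points are assumptions, while the script names, the 'amides:base' image tag, the container names, and the bind-mount targets are taken from the patch above.

    # Hypothetical end-to-end run (assumed: executed from the project root, ./data already populated)
    ./build_image.sh         # builds the 'amides:base' Docker image
    ./run_experiments.sh     # runs ./experiments.sh as 'docker-user' in the 'amides-experiments' container;
                             # results are written to ./amides/models and ./amides/plots via bind mounts
    ./start_env.sh           # optional: interactive shell in the 'amides-env' container with the same mounts
    ./cleanup.sh             # removes the image and the generated ./amides/models and ./amides/plots directories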