diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index 4dd53c5d..954a15e6 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -32,4 +32,4 @@ jobs: python -m pip install black python -m pip install . - name: Code style check - run: python -m black --check src/ + run: black --check src/ diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index a3f829b7..e387c53a 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -23,5 +23,5 @@ jobs: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} run: | - python -m build --sdist --wheel --outdir dist/ . - python -m twine upload dist/* + build --sdist --wheel --outdir dist/ . + twine upload dist/* diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index db7ad28b..5678c20a 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,4 +40,4 @@ jobs: python -m pip install pytest python -m pip install . - name: Unit testing - run: python -m pytest + run: pytest -vv diff --git a/.github/workflows/unused-import.yml b/.github/workflows/unused-import.yml index 1429b838..22da3492 100644 --- a/.github/workflows/unused-import.yml +++ b/.github/workflows/unused-import.yml @@ -1,4 +1,4 @@ -name: Unused import check +name: Unused imports check on: push: @@ -17,7 +17,7 @@ on: - review_requested jobs: - unusedimport: + unusedimports: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 @@ -31,5 +31,5 @@ jobs: python -m pip install --upgrade setuptools python -m pip install unimport python -m pip install . 
- - name: Unused import check - run: python -m unimport --check --exclude __init__.py src/ + - name: Check + run: unimport --check --exclude __init__.py src/ diff --git a/AUTHORS.rst b/AUTHORS.rst index 5939d92a..3f044ba9 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -12,7 +12,9 @@ Contributors ------------ - `Charlie Wu`_ +- `Jill Casavant`_ .. _Xiaokui Shu: https://github.com/subbyte .. _Paul Coccoli: https://github.com/pcoccoli .. _Charlie Wu: https://github.com/charliewutw +.. _Jill Casavant: https://github.com/jmcasava diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 95472d1c..ed90104f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,25 @@ All notable changes to this project will be documented in this file. The format is based on `Keep a Changelog`_. +1.0.4 (2021-06-08) +================== + +Added +----- + +- GitHub action for pull requests + - Unit testing + - Code style check + - Unused imports check +- GitHub issue templates + +Changed +------- + +- More comprehensive entity identification logic +- Use firepit.merge() to implement prefetch merge +- Typo fix in doc + 1.0.3 (2021-05-31) ================== diff --git a/docs/index.rst b/docs/index.rst index 631be2f3..33bb77cc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,7 +6,7 @@ Kestrel Threat Hunting Language Kestrel threat hunting language provides an abstraction for threat hunters to focus on the high-value and composable threat hypothesis development instead of -specific realization of hypotheis testing with heterogeneous data sources, +specific realization of hypothesis testing with heterogeneous data sources, threat intelligence, and public or proprietary analytics. .. 
toctree:: diff --git a/setup.cfg b/setup.cfg index b0edc386..3a61c27e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = kestrel-lang -version = 1.0.3 +version = 1.0.4 description = Kestrel Threat Hunting Language long_description = file:README.rst long_description_content_type = text/x-rst diff --git a/src/kestrel/codegen/commands.py b/src/kestrel/codegen/commands.py index 6aa56aeb..602e3b6a 100644 --- a/src/kestrel/codegen/commands.py +++ b/src/kestrel/codegen/commands.py @@ -18,11 +18,8 @@ # - a table that can be imported to pandas dataframe ################################################################ -import os -import pathlib import functools import logging -import time import itertools from collections import OrderedDict @@ -233,10 +230,8 @@ def get(stmt, session): output = _output if session.config["prefetch"]["get"] and len(_output): - prefetch_ret_var_name = return_var_name + "_prefetch" - - pattern_pf = _prefetch( + prefetch_ret_entity_table = _prefetch( return_type, prefetch_ret_var_name, local_var_name, @@ -248,20 +243,16 @@ def get(stmt, session): session.session_id, session.data_source_manager, ) + else: + prefetch_ret_entity_table = None - if pattern_pf: - # this is a fix when the unique identifier in - # `kestrel.codegen.relations.stix_2_0_identical_mapping` is - # missing, especially for process. - - # TODO: this or_pattern() code can be removed when we have - # better logic of unique identifier of entities. 
- - full_pat = or_patterns([pattern, pattern_pf]) - session.store.extract(return_var_name, return_type, None, full_pat) - output = new_var( - session.store, return_var_name, [], stmt, session.symtable - ) + if prefetch_ret_entity_table: + session.store.merge( + return_var_name, [local_var_name, prefetch_ret_entity_table] + ) + output = new_var(session.store, return_var_name, [], stmt, session.symtable) + else: + output = new_var(session.store, local_var_name, [], stmt, session.symtable) else: raise KestrelInternalError(f"unknown type of source in {str(stmt)}") @@ -373,9 +364,10 @@ def find(stmt, session): # Second, prefetch all records of the entities and associated entities if session.config["prefetch"]["find"] and len(_output) and _output.data_source: - if _prefetch( + prefetch_ret_var_name = return_var_name + "_prefetch" + prefetch_ret_entity_table = _prefetch( return_type, - return_var_name, + prefetch_ret_var_name, local_var_name, time_range, start_offset, @@ -384,10 +376,17 @@ def find(stmt, session): session.store, session.session_id, session.data_source_manager, - ): - output = new_var( - session.store, return_var_name, [], stmt, session.symtable - ) + ) + else: + prefetch_ret_entity_table = None + + if prefetch_ret_entity_table: + session.store.merge( + return_var_name, [local_var_name, prefetch_ret_entity_table] + ) + output = new_var(session.store, return_var_name, [], stmt, session.symtable) + else: + output = new_var(session.store, local_var_name, [], stmt, session.symtable) return output, None @@ -490,27 +489,25 @@ def _prefetch( session_id (str): session ID. Returns: - [str]: pattern if the prefetch is performed. + str: the entity table in store if the prefetch is performed else None. 
""" - # only need to return bool in the future - pattern_body = compile_identical_entity_search_pattern(return_type, input_var_name) + pattern_body = compile_identical_entity_search_pattern( + input_var_name, symtable[input_var_name] + ) - if pattern_body: - # this may fail if the attribute in `stix_2_0_identical_mapping` does not exists - # this is important since STIX does not have any mandatory attributes for process/file - remote_pattern = build_pattern( - pattern_body, time_range, start_offset, end_offset, symtable, store - ) + remote_pattern = build_pattern( + pattern_body, time_range, start_offset, end_offset, symtable, store + ) - if remote_pattern: - data_source = symtable[input_var_name].data_source - resp = ds_manager.query(data_source, remote_pattern, session_id) - query_id = resp.load_to_store(store) + if remote_pattern: + data_source = symtable[input_var_name].data_source + resp = ds_manager.query(data_source, remote_pattern, session_id) + query_id = resp.load_to_store(store) - # build the return_var_name view in store - store.extract(return_var_name, return_type, None, remote_pattern) + # build the return_var_name view in store + store.extract(return_var_name, return_type, query_id, remote_pattern) - return remote_pattern + return return_var_name return None diff --git a/src/kestrel/codegen/display.py b/src/kestrel/codegen/display.py index 5185131c..250c649b 100644 --- a/src/kestrel/codegen/display.py +++ b/src/kestrel/codegen/display.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod from pandas import DataFrame -from numpy import nan import json from kestrel.exceptions import KestrelInternalError diff --git a/src/kestrel/codegen/pattern.py b/src/kestrel/codegen/pattern.py index 5e1afaff..8b6e88f1 100644 --- a/src/kestrel/codegen/pattern.py +++ b/src/kestrel/codegen/pattern.py @@ -1,4 +1,3 @@ -import itertools import dateutil.parser import datetime import logging diff --git a/src/kestrel/codegen/relations.py b/src/kestrel/codegen/relations.py 
index e0fc8831..9c8c73dc 100644 --- a/src/kestrel/codegen/relations.py +++ b/src/kestrel/codegen/relations.py @@ -8,6 +8,8 @@ import logging +from firepit.query import Query, Projection, Table, Unique + _logger = logging.getLogger(__name__) stix_2_0_ref_mapping = { @@ -61,8 +63,9 @@ ("windows-service-ext", "loaded", "user-account"): (["creator_user_ref"], True), } +# the first available attribute will be used to uniquely identify the entity stix_2_0_identical_mapping = { - # entity-type: combination of attributes used for identical entity lookup + # entity-type: id attributes candidates "directory": ("path",), "domain-name": ("value",), "email-addr": ("value",), @@ -74,7 +77,7 @@ # `pid` is optional in STIX standard # `first_observed` cannot be used since it may be wrong (derived from observation) # `command_line` or `name` may not be in data and cannot be used - "process": ("pid",), + "process": ("pid", "name"), "software": ("name",), "url": ("value",), "user-account": ("user_id",), # optional in STIX standard @@ -104,6 +107,31 @@ ) +def get_entity_id_attribute(variable): + # this function should always return something + # if no entity id attribute found, fall back to record "id" by default + # this works for: + # - no appropriate identifier attribute found given specific data + # - "network-traffic" (not in stix_2_0_identical_mapping) + id_attr = "id" + + if variable.type in stix_2_0_identical_mapping: + available_attributes = variable.store.columns(variable.entity_table) + for attr in stix_2_0_identical_mapping[variable.type]: + if attr in available_attributes: + query = Query() + query.append(Table(variable.entity_table)) + query.append(Projection([attr])) + query.append(Unique()) + rows = variable.store.run_query(query).fetchall() + all_values = [row[attr] for row in rows if row[attr]] + if all_values: + id_attr = attr + break + + return id_attr + + def are_entities_associated_with_x_ibm_event(entity_types): flags = [entity_type in stix_x_ibm_event_mapping 
for entity_type in entity_types] return all(flags) @@ -133,15 +161,10 @@ def compile_specific_relation_to_pattern( return pattern -def compile_identical_entity_search_pattern(entity_type, var_name): - comp_exps = [] - if entity_type in stix_2_0_identical_mapping: - for attribute in stix_2_0_identical_mapping[entity_type]: - comp_exps.append(f"{entity_type}:{attribute} = {var_name}.{attribute}") - pattern = "[" + " AND ".join(comp_exps) + "]" - _logger.debug(f"identical entity search pattern compiled: {pattern}") - else: - pattern = None +def compile_identical_entity_search_pattern(var_name, var_struct): + attribute = get_entity_id_attribute(var_struct) + pattern = f"[{var_struct.type}:{attribute} = {var_name}.{attribute}]" + _logger.debug(f"identical entity search pattern compiled: {pattern}") return pattern diff --git a/src/kestrel/codegen/summary.py b/src/kestrel/codegen/summary.py index b1022dbc..35188478 100644 --- a/src/kestrel/codegen/summary.py +++ b/src/kestrel/codegen/summary.py @@ -9,7 +9,7 @@ Join, ) from collections import OrderedDict -from kestrel.codegen.relations import all_entity_types, stix_2_0_identical_mapping +from kestrel.codegen.relations import all_entity_types, get_entity_id_attribute from kestrel.exceptions import KestrelInternalError @@ -35,12 +35,19 @@ def gen_variable_summary(var_name, var_struct): query_ids = _get_variable_query_ids(var_struct) + is_from_direct_datasource = False + var_birth_cmd = var_struct.birth_statement["command"] + if var_birth_cmd == "find" or ( + var_birth_cmd == "get" and "datasource" in var_struct.birth_statement + ): + is_from_direct_datasource = True + for table in var_struct.store.tables(): if table in all_entity_types: count = 0 - if query_ids: + if query_ids and is_from_direct_datasource: query_ids_filter = Filter([Predicate("query_id", "IN", query_ids)]) query = Query() query.append(Table(table)) @@ -65,8 +72,7 @@ def _get_variable_query_ids(variable): if variable.entity_table: query = Query() 
query.append(Table("__queries")) - query.append(Join("__membership", "sco_id", "=", "sco_id")) - query.append(Filter([Predicate("var", "=", variable.entity_table)])) + query.append(Join(variable.entity_table, "sco_id", "=", "id")) query.append(Projection(["query_id"])) query.append(Unique()) rows = variable.store.run_query(query).fetchall() @@ -80,12 +86,8 @@ def get_variable_entity_count(variable): if variable.entity_table: query = Query() query.append(Table(variable.entity_table)) - cols = ( - stix_2_0_identical_mapping[variable.type] - if variable.type in stix_2_0_identical_mapping - else ["id"] - ) - query.append(Projection(cols)) + entity_id_attr = get_entity_id_attribute(variable) + query.append(Projection([entity_id_attr])) query.append(Unique()) query.append(Count()) rows = variable.store.run_query(query).fetchall() diff --git a/src/kestrel_analytics_docker/interface.py b/src/kestrel_analytics_docker/interface.py index f9970ab8..8c3cbc1a 100644 --- a/src/kestrel_analytics_docker/interface.py +++ b/src/kestrel_analytics_docker/interface.py @@ -25,7 +25,6 @@ import docker import logging import pandas -import pickle from kestrel.analytics import AbstractAnalyticsInterface from kestrel.exceptions import ( diff --git a/tests/test_command_find.py b/tests/test_command_find.py index d694bd55..b50160c3 100644 --- a/tests/test_command_find.py +++ b/tests/test_command_find.py @@ -22,7 +22,6 @@ def test_return_table_not_exist(fake_bundle_file): correct_dict = { "display": "execution summary", "data": { - "execution time": 1, "variables updated": [ { "VARIABLE": "conns", @@ -47,4 +46,5 @@ def test_return_table_not_exist(fake_bundle_file): }, } output_dict = summaries[0].to_dict() + del output_dict["data"]["execution time"] assert output_dict == correct_dict diff --git a/tests/test_display.py b/tests/test_display.py index f89c1d5f..393dadc9 100644 --- a/tests/test_display.py +++ b/tests/test_display.py @@ -30,7 +30,6 @@ def test_display_block_summary_to_dict(): 
correct_dict = { "display": "execution summary", "data": { - "execution time": 1, "variables updated": [ { "VARIABLE": "newvar", @@ -44,6 +43,7 @@ def test_display_block_summary_to_dict(): }, } output_dict = d[0].to_dict() + del output_dict["data"]["execution time"] assert output_dict == correct_dict @@ -61,7 +61,6 @@ def test_display_block_summary_get_from_variable(): correct_dict = { "display": "execution summary", "data": { - "execution time": 1, "variables updated": [ { "VARIABLE": "newvar", @@ -82,4 +81,5 @@ def test_display_block_summary_get_from_variable(): }, } output_dict = d[0].to_dict() + del output_dict["data"]["execution time"] assert output_dict == correct_dict