
Commit

Merge pull request #38 from opencybersecurityalliance/develop
v1.0.4
subbyte authored Jun 8, 2021
2 parents 1a5480e + 0e7cf42 commit 29106de
Showing 16 changed files with 118 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-style.yml
@@ -32,4 +32,4 @@ jobs:
python -m pip install black
python -m pip install .
- name: Code style check
run: python -m black --check src/
run: black --check src/
4 changes: 2 additions & 2 deletions .github/workflows/publish-to-pypi.yml
@@ -23,5 +23,5 @@ jobs:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: |
python -m build --sdist --wheel --outdir dist/ .
python -m twine upload dist/*
build --sdist --wheel --outdir dist/ .
twine upload dist/*
2 changes: 1 addition & 1 deletion .github/workflows/unit-testing.yml
@@ -40,4 +40,4 @@ jobs:
python -m pip install pytest
python -m pip install .
- name: Unit testing
run: python -m pytest
run: pytest -vv
8 changes: 4 additions & 4 deletions .github/workflows/unused-import.yml
@@ -1,4 +1,4 @@
name: Unused import check
name: Unused imports check

on:
push:
@@ -17,7 +17,7 @@ on:
- review_requested

jobs:
unusedimport:
unusedimports:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
@@ -31,5 +31,5 @@ jobs:
python -m pip install --upgrade setuptools
python -m pip install unimport
python -m pip install .
- name: Unused import check
run: python -m unimport --check --exclude __init__.py src/
- name: Check
run: unimport --check --exclude __init__.py src/
2 changes: 2 additions & 0 deletions AUTHORS.rst
@@ -12,7 +12,9 @@ Contributors
------------

- `Charlie Wu`_
- `Jill Casavant`_

.. _Xiaokui Shu: https://github.com/subbyte
.. _Paul Coccoli: https://github.com/pcoccoli
.. _Charlie Wu: https://github.com/charliewutw
.. _Jill Casavant: https://github.com/jmcasava
19 changes: 19 additions & 0 deletions CHANGELOG.rst
@@ -6,6 +6,25 @@ All notable changes to this project will be documented in this file.

The format is based on `Keep a Changelog`_.

1.0.4 (2021-06-08)
==================

Added
-----

- GitHub action for pull requests
- Unit testing
- Code style check
- Unused imports check
- GitHub issue templates

Changed
-------

- More comprehensive entity identification logic
- Use firepit.merge() to implement prefetch merge
- Typo fix in doc

1.0.3 (2021-05-31)
==================

2 changes: 1 addition & 1 deletion docs/index.rst
@@ -6,7 +6,7 @@ Kestrel Threat Hunting Language

Kestrel threat hunting language provides an abstraction for threat hunters to
focus on the high-value and composable threat hypothesis development instead of
specific realization of hypotheis testing with heterogeneous data sources,
specific realization of hypothesis testing with heterogeneous data sources,
threat intelligence, and public or proprietary analytics.

.. toctree::
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = kestrel-lang
version = 1.0.3
version = 1.0.4
description = Kestrel Threat Hunting Language
long_description = file:README.rst
long_description_content_type = text/x-rst
79 changes: 38 additions & 41 deletions src/kestrel/codegen/commands.py
@@ -18,11 +18,8 @@
# - a table that can be imported to pandas dataframe
################################################################

import os
import pathlib
import functools
import logging
import time
import itertools
from collections import OrderedDict

@@ -233,10 +230,8 @@ def get(stmt, session):
output = _output

if session.config["prefetch"]["get"] and len(_output):

prefetch_ret_var_name = return_var_name + "_prefetch"

pattern_pf = _prefetch(
prefetch_ret_entity_table = _prefetch(
return_type,
prefetch_ret_var_name,
local_var_name,
@@ -248,20 +243,16 @@ def get(stmt, session):
session.session_id,
session.data_source_manager,
)
else:
prefetch_ret_entity_table = None

if pattern_pf:
# this is a fix when the unique identifier in
# `kestrel.codegen.relations.stix_2_0_identical_mapping` is
# missing, especially for process.

# TODO: this or_pattern() code can be removed when we have
# better logic of unique identifier of entities.

full_pat = or_patterns([pattern, pattern_pf])
session.store.extract(return_var_name, return_type, None, full_pat)
output = new_var(
session.store, return_var_name, [], stmt, session.symtable
)
if prefetch_ret_entity_table:
session.store.merge(
return_var_name, [local_var_name, prefetch_ret_entity_table]
)
output = new_var(session.store, return_var_name, [], stmt, session.symtable)
else:
output = new_var(session.store, local_var_name, [], stmt, session.symtable)

else:
raise KestrelInternalError(f"unknown type of source in {str(stmt)}")
@@ -373,9 +364,10 @@ def find(stmt, session):

# Second, prefetch all records of the entities and associated entities
if session.config["prefetch"]["find"] and len(_output) and _output.data_source:
if _prefetch(
prefetch_ret_var_name = return_var_name + "_prefetch"
prefetch_ret_entity_table = _prefetch(
return_type,
return_var_name,
prefetch_ret_var_name,
local_var_name,
time_range,
start_offset,
@@ -384,10 +376,17 @@
session.store,
session.session_id,
session.data_source_manager,
):
output = new_var(
session.store, return_var_name, [], stmt, session.symtable
)
)
else:
prefetch_ret_entity_table = None

if prefetch_ret_entity_table:
session.store.merge(
return_var_name, [local_var_name, prefetch_ret_entity_table]
)
output = new_var(session.store, return_var_name, [], stmt, session.symtable)
else:
output = new_var(session.store, local_var_name, [], stmt, session.symtable)

return output, None

@@ -490,27 +489,25 @@ def _prefetch(
session_id (str): session ID.
Returns:
[str]: pattern if the prefetch is performed.
str: the entity table in store if the prefetch is performed else None.
"""
# only need to return bool in the future

pattern_body = compile_identical_entity_search_pattern(return_type, input_var_name)
pattern_body = compile_identical_entity_search_pattern(
input_var_name, symtable[input_var_name]
)

if pattern_body:
# this may fail if the attribute in `stix_2_0_identical_mapping` does not exists
# this is important since STIX does not have any mandatory attributes for process/file
remote_pattern = build_pattern(
pattern_body, time_range, start_offset, end_offset, symtable, store
)
remote_pattern = build_pattern(
pattern_body, time_range, start_offset, end_offset, symtable, store
)

if remote_pattern:
data_source = symtable[input_var_name].data_source
resp = ds_manager.query(data_source, remote_pattern, session_id)
query_id = resp.load_to_store(store)
if remote_pattern:
data_source = symtable[input_var_name].data_source
resp = ds_manager.query(data_source, remote_pattern, session_id)
query_id = resp.load_to_store(store)

# build the return_var_name view in store
store.extract(return_var_name, return_type, None, remote_pattern)
# build the return_var_name view in store
store.extract(return_var_name, return_type, query_id, remote_pattern)

return remote_pattern
return return_var_name

return None
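
Note on the change above: the old flow re-extracted the return variable from a combined or_patterns() pattern, while the new flow extracts the prefetched records into a separate <var>_prefetch entity table and then calls store.merge() to union it with the locally matched table under the returned variable name. A minimal, store-free sketch of that merge semantics using pandas (the DataFrames, column values, and the de-duplication key are hypothetical; the real implementation uses firepit's store.extract() and store.merge() exactly as shown in the diff):

import pandas as pd

# hypothetical local matches and prefetched matches for the same entity type
local = pd.DataFrame({"pid": [101, 102], "name": ["cmd.exe", "powershell.exe"]})
prefetch = pd.DataFrame({"pid": [102, 103], "name": ["powershell.exe", "svchost.exe"]})

# rough equivalent of store.merge(return_var, [local_var, prefetch_var]):
# union the two entity tables, de-duplicated here on the entity id attribute
merged = (
    pd.concat([local, prefetch], ignore_index=True)
    .drop_duplicates(subset=["pid"])
    .reset_index(drop=True)
)
print(merged)

The de-duplication key in this sketch corresponds to the entity id attribute chosen by get_entity_id_attribute() in relations.py below.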
1 change: 0 additions & 1 deletion src/kestrel/codegen/display.py
@@ -1,6 +1,5 @@
from abc import ABC, abstractmethod
from pandas import DataFrame
from numpy import nan
import json

from kestrel.exceptions import KestrelInternalError
1 change: 0 additions & 1 deletion src/kestrel/codegen/pattern.py
@@ -1,4 +1,3 @@
import itertools
import dateutil.parser
import datetime
import logging
45 changes: 34 additions & 11 deletions src/kestrel/codegen/relations.py
@@ -8,6 +8,8 @@

import logging

from firepit.query import Query, Projection, Table, Unique

_logger = logging.getLogger(__name__)

stix_2_0_ref_mapping = {
@@ -61,8 +63,9 @@
("windows-service-ext", "loaded", "user-account"): (["creator_user_ref"], True),
}

# the first available attribute will be used to uniquely identify the entity
stix_2_0_identical_mapping = {
# entity-type: combination of attributes used for identical entity lookup
# entity-type: id attributes candidates
"directory": ("path",),
"domain-name": ("value",),
"email-addr": ("value",),
@@ -74,7 +77,7 @@
# `pid` is optional in STIX standard
# `first_observed` cannot be used since it may be wrong (derived from observation)
# `command_line` or `name` may not be in data and cannot be used
"process": ("pid",),
"process": ("pid", "name"),
"software": ("name",),
"url": ("value",),
"user-account": ("user_id",), # optional in STIX standard
@@ -104,6 +107,31 @@
)


def get_entity_id_attribute(variable):
# this function should always return something
# if no entity id attribute found, fall back to record "id" by default
# this works for:
# - no appropriate identifier attribute found given specific data
# - "network-traffic" (not in stix_2_0_identical_mapping)
id_attr = "id"

if variable.type in stix_2_0_identical_mapping:
available_attributes = variable.store.columns(variable.entity_table)
for attr in stix_2_0_identical_mapping[variable.type]:
if attr in available_attributes:
query = Query()
query.append(Table(variable.entity_table))
query.append(Projection([attr]))
query.append(Unique())
rows = variable.store.run_query(query).fetchall()
all_values = [row[attr] for row in rows if row[attr]]
if all_values:
id_attr = attr
break

return id_attr


def are_entities_associated_with_x_ibm_event(entity_types):
flags = [entity_type in stix_x_ibm_event_mapping for entity_type in entity_types]
return all(flags)
@@ -133,15 +161,10 @@ def compile_specific_relation_to_pattern(
return pattern


def compile_identical_entity_search_pattern(entity_type, var_name):
comp_exps = []
if entity_type in stix_2_0_identical_mapping:
for attribute in stix_2_0_identical_mapping[entity_type]:
comp_exps.append(f"{entity_type}:{attribute} = {var_name}.{attribute}")
pattern = "[" + " AND ".join(comp_exps) + "]"
_logger.debug(f"identical entity search pattern compiled: {pattern}")
else:
pattern = None
def compile_identical_entity_search_pattern(var_name, var_struct):
attribute = get_entity_id_attribute(var_struct)
pattern = f"[{var_struct.type}:{attribute} = {var_name}.{attribute}]"
_logger.debug(f"identical entity search pattern compiled: {pattern}")
return pattern


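
For context on the new entity identification logic: get_entity_id_attribute() walks the candidate attributes for the variable's entity type and returns the first one that actually carries non-null values in the store, falling back to the record "id"; compile_identical_entity_search_pattern() then builds the identical-entity search pattern from that attribute. A simplified, store-free sketch of the selection and the resulting pattern (the ID_CANDIDATES constant, records, and variable name are made up for illustration; the real function runs a firepit Query with Projection and Unique against the variable's entity table):

ID_CANDIDATES = {"process": ("pid", "name")}

def pick_id_attribute(entity_type, records):
    # return the first candidate attribute with any usable value,
    # falling back to the record "id" otherwise
    for attr in ID_CANDIDATES.get(entity_type, ()):
        if any(rec.get(attr) not in (None, "") for rec in records):
            return attr
    return "id"

# hypothetical process records where every pid is missing
records = [{"id": "process--1", "pid": None, "name": "cmd.exe"},
           {"id": "process--2", "pid": None, "name": "powershell.exe"}]

attr = pick_id_attribute("process", records)     # -> "name"
pattern = f"[process:{attr} = my_procs.{attr}]"  # -> "[process:name = my_procs.name]"
print(attr, pattern)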
22 changes: 12 additions & 10 deletions src/kestrel/codegen/summary.py
@@ -9,7 +9,7 @@
Join,
)
from collections import OrderedDict
from kestrel.codegen.relations import all_entity_types, stix_2_0_identical_mapping
from kestrel.codegen.relations import all_entity_types, get_entity_id_attribute
from kestrel.exceptions import KestrelInternalError


@@ -35,12 +35,19 @@ def gen_variable_summary(var_name, var_struct):

query_ids = _get_variable_query_ids(var_struct)

is_from_direct_datasource = False
var_birth_cmd = var_struct.birth_statement["command"]
if var_birth_cmd == "find" or (
var_birth_cmd == "get" and "datasource" in var_struct.birth_statement
):
is_from_direct_datasource = True

for table in var_struct.store.tables():

if table in all_entity_types:
count = 0

if query_ids:
if query_ids and is_from_direct_datasource:
query_ids_filter = Filter([Predicate("query_id", "IN", query_ids)])
query = Query()
query.append(Table(table))
@@ -65,8 +72,7 @@ def _get_variable_query_ids(variable):
if variable.entity_table:
query = Query()
query.append(Table("__queries"))
query.append(Join("__membership", "sco_id", "=", "sco_id"))
query.append(Filter([Predicate("var", "=", variable.entity_table)]))
query.append(Join(variable.entity_table, "sco_id", "=", "id"))
query.append(Projection(["query_id"]))
query.append(Unique())
rows = variable.store.run_query(query).fetchall()
@@ -80,12 +86,8 @@ def get_variable_entity_count(variable):
if variable.entity_table:
query = Query()
query.append(Table(variable.entity_table))
cols = (
stix_2_0_identical_mapping[variable.type]
if variable.type in stix_2_0_identical_mapping
else ["id"]
)
query.append(Projection(cols))
entity_id_attr = get_entity_id_attribute(variable)
query.append(Projection([entity_id_attr]))
query.append(Unique())
query.append(Count())
rows = variable.store.run_query(query).fetchall()
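
With this change, get_variable_entity_count() projects the single id attribute chosen by get_entity_id_attribute(), de-duplicates, and counts, i.e. conceptually a SELECT COUNT(DISTINCT <id_attr>). A small sqlite3 sketch of that query shape (the table name and rows are made up; the real code composes the query with firepit's Query, Projection, Unique, and Count):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE procs (id TEXT, pid INTEGER, name TEXT)")
conn.executemany(
    "INSERT INTO procs VALUES (?, ?, ?)",
    [("process--1", 101, "cmd.exe"),
     ("process--2", 101, "cmd.exe"),      # duplicate record for the same entity
     ("process--3", 102, "powershell.exe")],
)

# rough equivalent of Projection(["pid"]) + Unique() + Count()
(count,) = conn.execute(
    "SELECT COUNT(*) FROM (SELECT DISTINCT pid FROM procs)"
).fetchone()
print(count)  # 2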
1 change: 0 additions & 1 deletion src/kestrel_analytics_docker/interface.py
@@ -25,7 +25,6 @@
import docker
import logging
import pandas
import pickle

from kestrel.analytics import AbstractAnalyticsInterface
from kestrel.exceptions import (
2 changes: 1 addition & 1 deletion tests/test_command_find.py
@@ -22,7 +22,6 @@ def test_return_table_not_exist(fake_bundle_file):
correct_dict = {
"display": "execution summary",
"data": {
"execution time": 1,
"variables updated": [
{
"VARIABLE": "conns",
@@ -47,4 +46,5 @@
},
}
output_dict = summaries[0].to_dict()
del output_dict["data"]["execution time"]
assert output_dict == correct_dict
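
The updated test drops the non-deterministic "execution time" field from the actual output before comparing against the expected summary, rather than hard-coding a value for it. A tiny illustration of the pattern (both dictionaries are made up):

expected = {"display": "execution summary", "data": {"variables updated": []}}
output = {"display": "execution summary",
          "data": {"execution time": 3, "variables updated": []}}

del output["data"]["execution time"]  # strip the volatile field before comparison
assert output == expected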