
Commit

Merge pull request #38 from opencybersecurityalliance/develop
v1.0.4
subbyte authored Jun 8, 2021
2 parents 1a5480e + 0e7cf42 commit 29106de
Showing 16 changed files with 118 additions and 78 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-style.yml
@@ -32,4 +32,4 @@ jobs:
python -m pip install black
python -m pip install .
- name: Code style check
run: python -m black --check src/
run: black --check src/
4 changes: 2 additions & 2 deletions .github/workflows/publish-to-pypi.yml
@@ -23,5 +23,5 @@ jobs:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: |
python -m build --sdist --wheel --outdir dist/ .
python -m twine upload dist/*
build --sdist --wheel --outdir dist/ .
twine upload dist/*
2 changes: 1 addition & 1 deletion .github/workflows/unit-testing.yml
@@ -40,4 +40,4 @@ jobs:
python -m pip install pytest
python -m pip install .
- name: Unit testing
run: python -m pytest
run: pytest -vv
8 changes: 4 additions & 4 deletions .github/workflows/unused-import.yml
@@ -1,4 +1,4 @@
name: Unused import check
name: Unused imports check

on:
push:
@@ -17,7 +17,7 @@ on:
- review_requested

jobs:
unusedimport:
unusedimports:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
@@ -31,5 +31,5 @@ jobs:
python -m pip install --upgrade setuptools
python -m pip install unimport
python -m pip install .
- name: Unused import check
run: python -m unimport --check --exclude __init__.py src/
- name: Check
run: unimport --check --exclude __init__.py src/
2 changes: 2 additions & 0 deletions AUTHORS.rst
@@ -12,7 +12,9 @@ Contributors
------------

- `Charlie Wu`_
- `Jill Casavant`_

.. _Xiaokui Shu: https://github.com/subbyte
.. _Paul Coccoli: https://github.com/pcoccoli
.. _Charlie Wu: https://github.com/charliewutw
.. _Jill Casavant: https://github.com/jmcasava
19 changes: 19 additions & 0 deletions CHANGELOG.rst
@@ -6,6 +6,25 @@ All notable changes to this project will be documented in this file.

The format is based on `Keep a Changelog`_.

1.0.4 (2021-06-08)
==================

Added
-----

- GitHub action for pull requests
- Unit testing
- Code style check
- Unused imports check
- GitHub issue templates

Changed
-------

- More comprehensive entity identification logic
- Use firepit.merge() to implement prefetch merge
- Typo fix in doc

1.0.3 (2021-05-31)
==================

2 changes: 1 addition & 1 deletion docs/index.rst
@@ -6,7 +6,7 @@ Kestrel Threat Hunting Language

Kestrel threat hunting language provides an abstraction for threat hunters to
focus on the high-value and composable threat hypothesis development instead of
specific realization of hypotheis testing with heterogeneous data sources,
specific realization of hypothesis testing with heterogeneous data sources,
threat intelligence, and public or proprietary analytics.

.. toctree::
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = kestrel-lang
version = 1.0.3
version = 1.0.4
description = Kestrel Threat Hunting Language
long_description = file:README.rst
long_description_content_type = text/x-rst
79 changes: 38 additions & 41 deletions src/kestrel/codegen/commands.py
@@ -18,11 +18,8 @@
# - a table that can be imported to pandas dataframe
################################################################

import os
import pathlib
import functools
import logging
import time
import itertools
from collections import OrderedDict

@@ -233,10 +230,8 @@ def get(stmt, session):
output = _output

if session.config["prefetch"]["get"] and len(_output):

prefetch_ret_var_name = return_var_name + "_prefetch"

pattern_pf = _prefetch(
prefetch_ret_entity_table = _prefetch(
return_type,
prefetch_ret_var_name,
local_var_name,
@@ -248,20 +243,16 @@ def get(stmt, session):
session.session_id,
session.data_source_manager,
)
else:
prefetch_ret_entity_table = None

if pattern_pf:
# this is a fix when the unique identifier in
# `kestrel.codegen.relations.stix_2_0_identical_mapping` is
# missing, especially for process.

# TODO: this or_pattern() code can be removed when we have
# better logic of unique identifier of entities.

full_pat = or_patterns([pattern, pattern_pf])
session.store.extract(return_var_name, return_type, None, full_pat)
output = new_var(
session.store, return_var_name, [], stmt, session.symtable
)
if prefetch_ret_entity_table:
session.store.merge(
return_var_name, [local_var_name, prefetch_ret_entity_table]
)
output = new_var(session.store, return_var_name, [], stmt, session.symtable)
else:
output = new_var(session.store, local_var_name, [], stmt, session.symtable)

else:
raise KestrelInternalError(f"unknown type of source in {str(stmt)}")
@@ -373,9 +364,10 @@ def find(stmt, session):

# Second, prefetch all records of the entities and associated entities
if session.config["prefetch"]["find"] and len(_output) and _output.data_source:
if _prefetch(
prefetch_ret_var_name = return_var_name + "_prefetch"
prefetch_ret_entity_table = _prefetch(
return_type,
return_var_name,
prefetch_ret_var_name,
local_var_name,
time_range,
start_offset,
@@ -384,10 +376,17 @@
session.store,
session.session_id,
session.data_source_manager,
):
output = new_var(
session.store, return_var_name, [], stmt, session.symtable
)
)
else:
prefetch_ret_entity_table = None

if prefetch_ret_entity_table:
session.store.merge(
return_var_name, [local_var_name, prefetch_ret_entity_table]
)
output = new_var(session.store, return_var_name, [], stmt, session.symtable)
else:
output = new_var(session.store, local_var_name, [], stmt, session.symtable)

return output, None

@@ -490,27 +489,25 @@ def _prefetch(
session_id (str): session ID.
Returns:
[str]: pattern if the prefetch is performed.
str: the entity table in store if the prefetch is performed else None.
"""
# only need to return bool in the future

pattern_body = compile_identical_entity_search_pattern(return_type, input_var_name)
pattern_body = compile_identical_entity_search_pattern(
input_var_name, symtable[input_var_name]
)

if pattern_body:
# this may fail if the attribute in `stix_2_0_identical_mapping` does not exists
# this is important since STIX does not have any mandatory attributes for process/file
remote_pattern = build_pattern(
pattern_body, time_range, start_offset, end_offset, symtable, store
)
remote_pattern = build_pattern(
pattern_body, time_range, start_offset, end_offset, symtable, store
)

if remote_pattern:
data_source = symtable[input_var_name].data_source
resp = ds_manager.query(data_source, remote_pattern, session_id)
query_id = resp.load_to_store(store)
if remote_pattern:
data_source = symtable[input_var_name].data_source
resp = ds_manager.query(data_source, remote_pattern, session_id)
query_id = resp.load_to_store(store)

# build the return_var_name view in store
store.extract(return_var_name, return_type, None, remote_pattern)
# build the return_var_name view in store
store.extract(return_var_name, return_type, query_id, remote_pattern)

return remote_pattern
return return_var_name

return None
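
Note on the change above: the old flow re-extracted the return variable from a combined or_patterns() pattern, while the new flow extracts the prefetched records into a separate <var>_prefetch entity table and then calls store.merge() to union it with the locally matched table under the returned variable name. A minimal, store-free sketch of that merge semantics using pandas (the DataFrames, column values, and the de-duplication key are hypothetical; the real implementation uses firepit's store.extract() and store.merge() exactly as shown in the diff):

import pandas as pd

# hypothetical local matches and prefetched matches for the same entity type
local = pd.DataFrame({"pid": [101, 102], "name": ["cmd.exe", "powershell.exe"]})
prefetch = pd.DataFrame({"pid": [102, 103], "name": ["powershell.exe", "svchost.exe"]})

# rough equivalent of store.merge(return_var, [local_var, prefetch_var]):
# union the two entity tables, de-duplicated here on the entity id attribute
merged = (
    pd.concat([local, prefetch], ignore_index=True)
    .drop_duplicates(subset=["pid"])
    .reset_index(drop=True)
)
print(merged)

The de-duplication key in this sketch corresponds to the entity id attribute chosen by get_entity_id_attribute() in relations.py below.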
1 change: 0 additions & 1 deletion src/kestrel/codegen/display.py
@@ -1,6 +1,5 @@
from abc import ABC, abstractmethod
from pandas import DataFrame
from numpy import nan
import json

from kestrel.exceptions import KestrelInternalError
1 change: 0 additions & 1 deletion src/kestrel/codegen/pattern.py
@@ -1,4 +1,3 @@
import itertools
import dateutil.parser
import datetime
import logging
45 changes: 34 additions & 11 deletions src/kestrel/codegen/relations.py
@@ -8,6 +8,8 @@

import logging

from firepit.query import Query, Projection, Table, Unique

_logger = logging.getLogger(__name__)

stix_2_0_ref_mapping = {
@@ -61,8 +63,9 @@
("windows-service-ext", "loaded", "user-account"): (["creator_user_ref"], True),
}

# the first available attribute will be used to uniquely identify the entity
stix_2_0_identical_mapping = {
# entity-type: combination of attributes used for identical entity lookup
# entity-type: id attributes candidates
"directory": ("path",),
"domain-name": ("value",),
"email-addr": ("value",),
@@ -74,7 +77,7 @@
# `pid` is optional in STIX standard
# `first_observed` cannot be used since it may be wrong (derived from observation)
# `command_line` or `name` may not be in data and cannot be used
"process": ("pid",),
"process": ("pid", "name"),
"software": ("name",),
"url": ("value",),
"user-account": ("user_id",), # optional in STIX standard
@@ -104,6 +107,31 @@
)


def get_entity_id_attribute(variable):
# this function should always return something
# if no entity id attribute found, fall back to record "id" by default
# this works for:
# - no appropriate identifier attribute found given specific data
# - "network-traffic" (not in stix_2_0_identical_mapping)
id_attr = "id"

if variable.type in stix_2_0_identical_mapping:
available_attributes = variable.store.columns(variable.entity_table)
for attr in stix_2_0_identical_mapping[variable.type]:
if attr in available_attributes:
query = Query()
query.append(Table(variable.entity_table))
query.append(Projection([attr]))
query.append(Unique())
rows = variable.store.run_query(query).fetchall()
all_values = [row[attr] for row in rows if row[attr]]
if all_values:
id_attr = attr
break

return id_attr


def are_entities_associated_with_x_ibm_event(entity_types):
flags = [entity_type in stix_x_ibm_event_mapping for entity_type in entity_types]
return all(flags)
@@ -133,15 +161,10 @@ def compile_specific_relation_to_pattern(
return pattern


def compile_identical_entity_search_pattern(entity_type, var_name):
comp_exps = []
if entity_type in stix_2_0_identical_mapping:
for attribute in stix_2_0_identical_mapping[entity_type]:
comp_exps.append(f"{entity_type}:{attribute} = {var_name}.{attribute}")
pattern = "[" + " AND ".join(comp_exps) + "]"
_logger.debug(f"identical entity search pattern compiled: {pattern}")
else:
pattern = None
def compile_identical_entity_search_pattern(var_name, var_struct):
attribute = get_entity_id_attribute(var_struct)
pattern = f"[{var_struct.type}:{attribute} = {var_name}.{attribute}]"
_logger.debug(f"identical entity search pattern compiled: {pattern}")
return pattern


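
For context on the new entity identification logic: get_entity_id_attribute() walks the candidate attributes for the variable's entity type and returns the first one that actually carries non-null values in the store, falling back to the record "id"; compile_identical_entity_search_pattern() then builds the identical-entity search pattern from that attribute. A simplified, store-free sketch of the selection and the resulting pattern (the ID_CANDIDATES constant, records, and variable name are made up for illustration; the real function runs a firepit Query with Projection and Unique against the variable's entity table):

ID_CANDIDATES = {"process": ("pid", "name")}

def pick_id_attribute(entity_type, records):
    # return the first candidate attribute with any usable value,
    # falling back to the record "id" otherwise
    for attr in ID_CANDIDATES.get(entity_type, ()):
        if any(rec.get(attr) not in (None, "") for rec in records):
            return attr
    return "id"

# hypothetical process records where every pid is missing
records = [{"id": "process--1", "pid": None, "name": "cmd.exe"},
           {"id": "process--2", "pid": None, "name": "powershell.exe"}]

attr = pick_id_attribute("process", records)     # -> "name"
pattern = f"[process:{attr} = my_procs.{attr}]"  # -> "[process:name = my_procs.name]"
print(attr, pattern)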
22 changes: 12 additions & 10 deletions src/kestrel/codegen/summary.py
@@ -9,7 +9,7 @@
Join,
)
from collections import OrderedDict
from kestrel.codegen.relations import all_entity_types, stix_2_0_identical_mapping
from kestrel.codegen.relations import all_entity_types, get_entity_id_attribute
from kestrel.exceptions import KestrelInternalError


@@ -35,12 +35,19 @@ def gen_variable_summary(var_name, var_struct):

query_ids = _get_variable_query_ids(var_struct)

is_from_direct_datasource = False
var_birth_cmd = var_struct.birth_statement["command"]
if var_birth_cmd == "find" or (
var_birth_cmd == "get" and "datasource" in var_struct.birth_statement
):
is_from_direct_datasource = True

for table in var_struct.store.tables():

if table in all_entity_types:
count = 0

if query_ids:
if query_ids and is_from_direct_datasource:
query_ids_filter = Filter([Predicate("query_id", "IN", query_ids)])
query = Query()
query.append(Table(table))
@@ -65,8 +72,7 @@ def _get_variable_query_ids(variable):
if variable.entity_table:
query = Query()
query.append(Table("__queries"))
query.append(Join("__membership", "sco_id", "=", "sco_id"))
query.append(Filter([Predicate("var", "=", variable.entity_table)]))
query.append(Join(variable.entity_table, "sco_id", "=", "id"))
query.append(Projection(["query_id"]))
query.append(Unique())
rows = variable.store.run_query(query).fetchall()
@@ -80,12 +86,8 @@ def get_variable_entity_count(variable):
if variable.entity_table:
query = Query()
query.append(Table(variable.entity_table))
cols = (
stix_2_0_identical_mapping[variable.type]
if variable.type in stix_2_0_identical_mapping
else ["id"]
)
query.append(Projection(cols))
entity_id_attr = get_entity_id_attribute(variable)
query.append(Projection([entity_id_attr]))
query.append(Unique())
query.append(Count())
rows = variable.store.run_query(query).fetchall()
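
With this change, get_variable_entity_count() projects the single id attribute chosen by get_entity_id_attribute(), de-duplicates, and counts, i.e. conceptually a SELECT COUNT(DISTINCT <id_attr>). A small sqlite3 sketch of that query shape (the table name and rows are made up; the real code composes the query with firepit's Query, Projection, Unique, and Count):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE procs (id TEXT, pid INTEGER, name TEXT)")
conn.executemany(
    "INSERT INTO procs VALUES (?, ?, ?)",
    [("process--1", 101, "cmd.exe"),
     ("process--2", 101, "cmd.exe"),      # duplicate record for the same entity
     ("process--3", 102, "powershell.exe")],
)

# rough equivalent of Projection(["pid"]) + Unique() + Count()
(count,) = conn.execute(
    "SELECT COUNT(*) FROM (SELECT DISTINCT pid FROM procs)"
).fetchone()
print(count)  # 2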
1 change: 0 additions & 1 deletion src/kestrel_analytics_docker/interface.py
@@ -25,7 +25,6 @@
import docker
import logging
import pandas
import pickle

from kestrel.analytics import AbstractAnalyticsInterface
from kestrel.exceptions import (
2 changes: 1 addition & 1 deletion tests/test_command_find.py
@@ -22,7 +22,6 @@ def test_return_table_not_exist(fake_bundle_file):
correct_dict = {
"display": "execution summary",
"data": {
"execution time": 1,
"variables updated": [
{
"VARIABLE": "conns",
@@ -47,4 +46,5 @@
},
}
output_dict = summaries[0].to_dict()
del output_dict["data"]["execution time"]
assert output_dict == correct_dict
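
The updated test drops the non-deterministic "execution time" field from the actual output before comparing against the expected summary, rather than hard-coding a value for it. A tiny illustration of the pattern (both dictionaries are made up):

expected = {"display": "execution summary", "data": {"variables updated": []}}
output = {"display": "execution summary",
          "data": {"execution time": 3, "variables updated": []}}

del output["data"]["execution time"]  # strip the volatile field before comparison
assert output == expected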