Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vmray: skip non-printable strings #2551

Merged
merged 2 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

- vmray: load more analysis archives @mr-tz
- dynamic: only check file limitations for static file formats @mr-tz
- vmray: skip non-printable strings @mike-hunhoff

### capa Explorer Web

Expand Down
6 changes: 6 additions & 0 deletions capa/features/extractors/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# See the License for the specific language governing permissions and limitations under the License.

import re
import string
import contextlib
from collections import namedtuple

Expand All @@ -19,6 +20,7 @@
# matches runs of four or more bytes from the ASCII_BYTE character class, each followed by a NUL byte
# NOTE(review): ASCII_BYTE is not visible in this excerpt; presumably defined just above - confirm
UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4))
REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"]
SLICE_SIZE = 4096
# the characters of string.printable, built once as a set; is_printable_str() checks strings against it
PRINTABLE_CHAR_SET = set(string.printable)

String = namedtuple("String", ["s", "offset"])

Expand Down Expand Up @@ -84,3 +86,7 @@ def extract_unicode_strings(buf, n=4):
for match in r.finditer(buf):
with contextlib.suppress(UnicodeDecodeError):
yield String(match.group().decode("utf-16"), match.start())


def is_printable_str(s: str) -> bool:
    """Return True when every character of *s* is a member of PRINTABLE_CHAR_SET.

    An empty string contains no offending characters and therefore yields True.
    """
    return all(ch in PRINTABLE_CHAR_SET for ch in s)
9 changes: 4 additions & 5 deletions capa/features/extractors/vmray/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from capa.features.insn import API, Number
from capa.features.common import String, Feature
from capa.features.address import Address
from capa.features.extractors.strings import is_printable_str
from capa.features.extractors.vmray.models import PARAM_TYPE_INT, PARAM_TYPE_STR, Param, FunctionCall, hexint
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle

Expand All @@ -27,11 +28,9 @@ def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[tuple[Feat
if param.deref.type_ in PARAM_TYPE_INT:
yield Number(hexint(param.deref.value)), ch.address
elif param.deref.type_ in PARAM_TYPE_STR:
# TODO(mr-tz): remove FPS like " \\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\x09\\x0a\\x0b\\x0c\\x0d\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\..."
# https://github.com/mandiant/capa/issues/2432

# parsing the data up to here results in double-escaped backslashes, remove those here
yield String(param.deref.value.replace("\\\\", "\\")), ch.address
if is_printable_str(param.deref.value):
# parsing the data up to here results in double-escaped backslashes, remove those here
yield String(param.deref.value.replace("\\\\", "\\")), ch.address
else:
logger.debug("skipping deref param type %s", param.deref.type_)
elif param.value is not None:
Expand Down
Loading