mandiant · mike-hunhoff · Jan 8, 2025 · Jan 7, 2025 · Jan 7, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@
 
 - vmray: load more analysis archives @mr-tz
 - dynamic: only check file limitations for static file formats @mr-tz
+- vmray: skip non-printable strings @mike-hunhoff
 
 ### capa Explorer Web
 

diff --git a/capa/features/extractors/strings.py b/capa/features/extractors/strings.py
@@ -9,6 +9,7 @@
 # See the License for the specific language governing permissions and limitations under the License.
 
 import re
+import string
 import contextlib
 from collections import namedtuple
 
@@ -19,6 +20,7 @@
 UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4))
 REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"]
 SLICE_SIZE = 4096
+PRINTABLE_CHAR_SET = set(string.printable)
 
 String = namedtuple("String", ["s", "offset"])
 
@@ -84,3 +86,7 @@ def extract_unicode_strings(buf, n=4):
     for match in r.finditer(buf):
         with contextlib.suppress(UnicodeDecodeError):
             yield String(match.group().decode("utf-16"), match.start())
+
+
+def is_printable_str(s: str) -> bool:  # True iff every character of s is in PRINTABLE_CHAR_SET; "" is vacuously True
+    return set(s).issubset(PRINTABLE_CHAR_SET)  # PRINTABLE_CHAR_SET is set(string.printable), so any non-ASCII character fails
diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py
@@ -12,6 +12,7 @@
 from capa.features.insn import API, Number
 from capa.features.common import String, Feature
 from capa.features.address import Address
+from capa.features.extractors.strings import is_printable_str
 from capa.features.extractors.vmray.models import PARAM_TYPE_INT, PARAM_TYPE_STR, Param, FunctionCall, hexint
 from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
 
@@ -27,11 +28,9 @@ def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[tuple[Feat
             if param.deref.type_ in PARAM_TYPE_INT:
                 yield Number(hexint(param.deref.value)), ch.address
             elif param.deref.type_ in PARAM_TYPE_STR:
-                # TODO(mr-tz): remove FPS like " \\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\x09\\x0a\\x0b\\x0c\\x0d\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\..."
-                # https://github.com/mandiant/capa/issues/2432
-
-                # parsing the data up to here results in double-escaped backslashes, remove those here
-                yield String(param.deref.value.replace("\\\\", "\\")), ch.address
+                if is_printable_str(param.deref.value):
+                    # parsing the data up to here results in double-escaped backslashes, remove those here
+                    yield String(param.deref.value.replace("\\\\", "\\")), ch.address
             else:
                 logger.debug("skipping deref param type %s", param.deref.type_)
     elif param.value is not None: