Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 17 additions & 65 deletions floss/language/rust/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
import pathlib
import argparse
import itertools
from typing import List, Tuple, Iterable, Optional
from typing import List, Iterable

import pefile
import binary2strings as b2s

from .utf8_strings import extract_utf8_strings
from floss.results import StaticString, StringEncoding
from floss.language.utils import (
find_lea_xrefs,
Expand All @@ -36,62 +36,6 @@
MIN_STR_LEN = 4


def fix_b2s_wide_strings(
    strings: List[Tuple[str, str, Tuple[int, int], bool]], min_length: int, buffer: bytes
) -> List[Tuple[str, str, Tuple[int, int], bool]]:
    """
    Repair b2s results that were misclassified as wide (UTF-16) strings.

    Args:
        strings: b2s result tuples of (string, type, (start, end), flag),
            presumably as returned by b2s.extract_all_strings — TODO confirm
            the semantics of the boolean flag.
        min_length: minimum length a re-extracted replacement must have to be kept.
        buffer: the raw bytes the strings were extracted from; offsets in
            `strings` index into this buffer.

    Returns:
        A new list in which each misparsed WIDE_STRING entry is dropped and the
        immediately-following entry is replaced by a string re-extracted one
        byte past the wide string's start, when that re-extraction contains it.
    """
    # TODO(mr-tz): b2s may parse wide strings where there really should be utf-8 strings
    # handle special cases here until fixed
    # https://github.com/mandiant/flare-floss/issues/867
    fixed_strings: List[Tuple[str, str, Tuple[int, int], bool]] = list()
    # pending replacement candidate produced from the last WIDE_STRING entry
    last_fixup: Optional[Tuple[str, str, Tuple[int, int], bool]] = None
    for string in strings:
        s = string[0]
        string_type = string[1]
        start = string[2][0]

        if string_type == "WIDE_STRING":
            sd = s.encode("utf-16le", "ignore")
            # utf-8 strings will not start with \x00
            if sd[0] == 0:
                # re-extract starting one byte further, where the real UTF-8
                # string presumably begins; shift offsets back to buffer space
                new_string = b2s.extract_string(buffer[start + 1 :])
                last_fixup = (
                    new_string[0],
                    new_string[1],
                    (new_string[2][0] + start + 1, new_string[2][1] + start + 1),
                    new_string[3],
                )
                if len(last_fixup[0]) < min_length:
                    last_fixup = None
        else:
            # a non-wide entry subsumed by the pending fixup is replaced by it;
            # note WIDE_STRING entries themselves are never appended
            if last_fixup and s in last_fixup[0]:
                fixed_strings.append(last_fixup)
            else:
                fixed_strings.append(string)
            last_fixup = None
    return fixed_strings


def filter_and_transform_utf8_strings(
    strings: List[Tuple[str, str, Tuple[int, int], bool]],
    start_rdata: int,
) -> List[StaticString]:
    """
    Select the UTF8-typed b2s results and convert them to StaticString.

    Args:
        strings: b2s result tuples of (string, type, (start, end), flag).
        start_rdata: base offset added to each string's start offset.

    Returns:
        StaticString objects for every UTF8 entry, with newlines stripped
        and offsets rebased onto start_rdata.
    """
    results: List[StaticString] = []
    for text, kind, (begin, _end), _flag in strings:
        if kind != "UTF8":
            continue
        results.append(
            StaticString(
                # our static algorithm does not extract new lines either
                string=text.replace("\n", ""),
                offset=begin + start_rdata,
                encoding=StringEncoding.UTF8,
            )
        )
    return results


def split_strings(static_strings: List[StaticString], address: int, min_length: int) -> None:
"""
if address is in between start and end of a string in ref data then split the string
Expand Down Expand Up @@ -163,12 +107,20 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
pointer_to_raw_data = rdata_section.PointerToRawData
buffer_rdata = rdata_section.get_data()

# extract utf-8 and wide strings, latter not needed here
strings = b2s.extract_all_strings(buffer_rdata, min_length)
fixed_strings = fix_b2s_wide_strings(strings, min_length, buffer_rdata)

# select only UTF-8 strings and adjust offset
static_strings = filter_and_transform_utf8_strings(fixed_strings, start_rdata)
# Extract strictly valid UTF-8 strings using our custom regex implementation
# This automatically prevents the b2s wide-string garbage extraction bug
static_strings: List[StaticString] = []
for offset, string_val in extract_utf8_strings(buffer_rdata, min_length=min_length):
# our static algorithm does not extract new lines either
clean_str = string_val.replace("\n", "")

static_strings.append(
StaticString(
string=clean_str,
offset=offset + start_rdata,
encoding=StringEncoding.UTF8
)
)
Comment on lines +112 to +123
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The loop for building static_strings can be simplified into a more concise and Pythonic list comprehension. This improves readability and maintainability.

    static_strings: List[StaticString] = [
        StaticString(
            string=string_val.replace("\n", ""),
            offset=offset + start_rdata,
            encoding=StringEncoding.UTF8,
        )
        for offset, string_val in extract_utf8_strings(buffer_rdata, min_length=min_length)
    ]


# TODO(mr-tz) - handle miss in rust-hello64.exe
# .rdata:00000001400C1270 0A aPanickedAfterP db 0Ah ; DATA XREF: .rdata:00000001400C12B8↓o
Expand Down Expand Up @@ -228,4 +180,4 @@ def main(argv=None):


if __name__ == "__main__":
sys.exit(main())
sys.exit(main())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The file is missing a final newline character. It's a standard convention (part of the POSIX standard for text files) to end files with a newline. Some tools may not process the last line correctly if it's not terminated by a newline. Please add a newline at the end of the file for compatibility and consistency.

    sys.exit(main())

60 changes: 60 additions & 0 deletions floss/language/rust/strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import re
from typing import Iterator, Tuple

'''This regular expression is strictly designed to match printable ASCII
and valid UTF-8 multi-byte sequences.'''
Comment on lines +1 to +5
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The multi-line string on lines 4-5 should be a module-level docstring placed at the top of the file, as per PEP 257. This improves code readability and allows documentation-generation tools to correctly parse the file's purpose.

"""This regular expression is strictly designed to match printable ASCII and valid UTF-8 multi-byte sequences."""

import re
from typing import Iterator, Tuple


# Matches one or more valid, printable UTF-8 encoded characters.
# The lead/continuation byte ranges deliberately exclude ill-formed input:
# overlong encodings (the \xE0 and \xF0 alternatives floor their second
# byte), UTF-16 surrogate code points (the \xED alternative caps its second
# byte at \x9F), and code points above U+10FFFF (the \xF4 alternative caps
# its second byte at \x8F).
UTF8_PRINTABLE_PATTERN = re.compile(
    b'(?:'
    b'[\x20-\x7E\t\r\n]' # 1-byte: ASCII printable characters and standard whitespace
    b'|[\xC2-\xDF][\x80-\xBF]' # 2-byte: Valid UTF-8 sequence
    b'|\xE0[\xA0-\xBF][\x80-\xBF]' # 3-byte: Valid UTF-8 sequence (no overlongs)
    b'|[\xE1-\xEC][\x80-\xBF]{2}' # 3-byte: Valid UTF-8 sequence
    b'|\xED[\x80-\x9F][\x80-\xBF]' # 3-byte: Valid UTF-8 sequence (no surrogates)
    b'|[\xEE-\xEF][\x80-\xBF]{2}' # 3-byte: Valid UTF-8 sequence
    b'|\xF0[\x90-\xBF][\x80-\xBF]{2}' # 4-byte: Valid UTF-8 sequence (no overlongs)
    b'|[\xF1-\xF3][\x80-\xBF]{3}' # 4-byte: Valid UTF-8 sequence
    b'|\xF4[\x80-\x8F][\x80-\xBF]{2}' # 4-byte: Valid UTF-8 sequence (<= U+10FFFF)
    b')+'
)

def extract_utf8_strings(buf: bytes, min_length: int = 4) -> Iterator[Tuple[int, str]]:
    """
    Scan a byte buffer and yield strictly valid, printable UTF-8 strings.

    UTF-16/wide strings never match the byte pattern, so they are ignored
    entirely — this avoids the garbage wide-string extraction seen in
    Rust binaries.

    Args:
        buf: the raw binary data to scan.
        min_length: minimum character count for a string to be reported.

    Yields:
        (offset, text) pairs: starting byte offset and the decoded string.
    """
    # One or more valid printable UTF-8 encoded characters; lead-byte ranges
    # exclude overlongs, surrogates, and code points above U+10FFFF.
    # (re.compile caches, so repeated calls do not re-parse the pattern.)
    pattern = re.compile(
        b'(?:'
        b'[\x20-\x7E\t\r\n]'              # 1-byte: ASCII printable + whitespace
        b'|[\xC2-\xDF][\x80-\xBF]'        # 2-byte sequences
        b'|\xE0[\xA0-\xBF][\x80-\xBF]'    # 3-byte, overlong-safe E0 lead
        b'|[\xE1-\xEC][\x80-\xBF]{2}'     # 3-byte sequences
        b'|\xED[\x80-\x9F][\x80-\xBF]'    # 3-byte, excludes surrogates
        b'|[\xEE-\xEF][\x80-\xBF]{2}'     # 3-byte sequences
        b'|\xF0[\x90-\xBF][\x80-\xBF]{2}' # 4-byte, overlong-safe F0 lead
        b'|[\xF1-\xF3][\x80-\xBF]{3}'     # 4-byte sequences
        b'|\xF4[\x80-\x8F][\x80-\xBF]{2}' # 4-byte, capped at U+10FFFF
        b')+'
    )

    for hit in pattern.finditer(buf):
        raw = hit.group(0)
        try:
            text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # The pattern should guarantee decodability; skip defensively if
            # an edge case ever slips past the regex.
            continue
        # only report strings meeting the minimum length threshold
        if len(text) >= min_length:
            yield (hit.start(), text)

if __name__ == "__main__":
# Quick sanity check / localized test
test_buffer = (
b"Garbage\x00\x00\x00" # Should be ignored or split
b"Valid_UTF8_String\x00" # Should extract "Valid_UTF8_String"
b"W\x00i\x00d\x00e\x00" # UTF-16 wide string: Should be completely ignored
b"\xE2\x9C\x93_Checkmark\x00" # Should extract "✓_Checkmark"
)

print("Extracting strings from test buffer...")
for offset, extracted_str in extract_utf8_strings(test_buffer, min_length=4):
print(f"Offset: {hex(offset)} | String: '{extracted_str}'")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The file is missing a final newline character. It's a standard convention (part of the POSIX standard for text files) to end files with a newline. Some tools may not process the last line correctly if it's not terminated by a newline. Please add a newline at the end of the file for compatibility and consistency.

        print(f"Offset: {hex(offset)} | String: '{extracted_str}'")