Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 17 additions & 65 deletions floss/language/rust/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
import pathlib
import argparse
import itertools
from typing import List, Tuple, Iterable, Optional
from typing import List, Iterable

import pefile
import binary2strings as b2s

from .utf8_strings import extract_utf8_strings
from floss.results import StaticString, StringEncoding
from floss.language.utils import (
find_lea_xrefs,
Expand All @@ -36,62 +36,6 @@
MIN_STR_LEN = 4


def fix_b2s_wide_strings(
    strings: List[Tuple[str, str, Tuple[int, int], bool]], min_length: int, buffer: bytes
) -> List[Tuple[str, str, Tuple[int, int], bool]]:
    """
    Repair b2s results that were misclassified as wide (UTF-16) strings.

    Args:
        strings: b2s result tuples of (string, type, (start, end), flag),
            presumably as returned by b2s.extract_all_strings — TODO confirm
            the semantics of the boolean flag.
        min_length: minimum length a re-extracted replacement must have to be kept.
        buffer: the raw bytes the strings were extracted from; offsets in
            `strings` index into this buffer.

    Returns:
        A new list in which each misparsed WIDE_STRING entry is dropped and the
        immediately-following entry is replaced by a string re-extracted one
        byte past the wide string's start, when that re-extraction contains it.
    """
    # TODO(mr-tz): b2s may parse wide strings where there really should be utf-8 strings
    # handle special cases here until fixed
    # https://github.com/mandiant/flare-floss/issues/867
    fixed_strings: List[Tuple[str, str, Tuple[int, int], bool]] = list()
    # pending replacement candidate produced from the last WIDE_STRING entry
    last_fixup: Optional[Tuple[str, str, Tuple[int, int], bool]] = None
    for string in strings:
        s = string[0]
        string_type = string[1]
        start = string[2][0]

        if string_type == "WIDE_STRING":
            sd = s.encode("utf-16le", "ignore")
            # utf-8 strings will not start with \x00
            if sd[0] == 0:
                # re-extract starting one byte further, where the real UTF-8
                # string presumably begins; shift offsets back to buffer space
                new_string = b2s.extract_string(buffer[start + 1 :])
                last_fixup = (
                    new_string[0],
                    new_string[1],
                    (new_string[2][0] + start + 1, new_string[2][1] + start + 1),
                    new_string[3],
                )
                if len(last_fixup[0]) < min_length:
                    last_fixup = None
        else:
            # a non-wide entry subsumed by the pending fixup is replaced by it;
            # note WIDE_STRING entries themselves are never appended
            if last_fixup and s in last_fixup[0]:
                fixed_strings.append(last_fixup)
            else:
                fixed_strings.append(string)
            last_fixup = None
    return fixed_strings


def filter_and_transform_utf8_strings(
    strings: List[Tuple[str, str, Tuple[int, int], bool]],
    start_rdata: int,
) -> List[StaticString]:
    """
    Select the UTF8-typed b2s results and convert them to StaticString.

    Args:
        strings: b2s result tuples of (string, type, (start, end), flag).
        start_rdata: base offset added to each string's start offset.

    Returns:
        StaticString objects for every UTF8 entry, with newlines stripped
        and offsets rebased onto start_rdata.
    """
    results: List[StaticString] = []
    for text, kind, (begin, _end), _flag in strings:
        if kind != "UTF8":
            continue
        results.append(
            StaticString(
                # our static algorithm does not extract new lines either
                string=text.replace("\n", ""),
                offset=begin + start_rdata,
                encoding=StringEncoding.UTF8,
            )
        )
    return results


def split_strings(static_strings: List[StaticString], address: int, min_length: int) -> None:
"""
if address is in between start and end of a string in ref data then split the string
Expand Down Expand Up @@ -163,12 +107,20 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
pointer_to_raw_data = rdata_section.PointerToRawData
buffer_rdata = rdata_section.get_data()

# extract utf-8 and wide strings, latter not needed here
strings = b2s.extract_all_strings(buffer_rdata, min_length)
fixed_strings = fix_b2s_wide_strings(strings, min_length, buffer_rdata)

# select only UTF-8 strings and adjust offset
static_strings = filter_and_transform_utf8_strings(fixed_strings, start_rdata)
# Extract strictly valid UTF-8 strings using our custom regex implementation
# This automatically prevents the b2s wide-string garbage extraction bug
static_strings: List[StaticString] = []
for offset, string_val in extract_utf8_strings(buffer_rdata, min_length=min_length):
# our static algorithm does not extract new lines either
clean_str = string_val.replace("\n", "")

static_strings.append(
StaticString(
string=clean_str,
offset=offset + start_rdata,
encoding=StringEncoding.UTF8
)
)
Comment on lines +112 to +123
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The loop for building static_strings can be simplified into a more concise and Pythonic list comprehension. This improves readability and maintainability.

    static_strings: List[StaticString] = [
        StaticString(
            string=string_val.replace("\n", ""),
            offset=offset + start_rdata,
            encoding=StringEncoding.UTF8,
        )
        for offset, string_val in extract_utf8_strings(buffer_rdata, min_length=min_length)
    ]


# TODO(mr-tz) - handle miss in rust-hello64.exe
# .rdata:00000001400C1270 0A aPanickedAfterP db 0Ah ; DATA XREF: .rdata:00000001400C12B8↓o
Expand Down Expand Up @@ -228,4 +180,4 @@ def main(argv=None):


if __name__ == "__main__":
sys.exit(main())
sys.exit(main())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The file is missing a final newline character. It's a standard convention (part of the POSIX standard for text files) to end files with a newline. Some tools may not process the last line correctly if it's not terminated by a newline. Please add a newline at the end of the file for compatibility and consistency.

    sys.exit(main())

60 changes: 60 additions & 0 deletions floss/language/rust/strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import re
from typing import Iterator, Tuple

'''This regular expression is strictly designed to match printable ASCII
and valid UTF-8 multi-byte sequences.'''
Comment on lines +1 to +5
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The multi-line string on lines 4-5 should be a module-level docstring placed at the top of the file, as per PEP 257. This improves code readability and allows documentation-generation tools to correctly parse the file's purpose.

"""This regular expression is strictly designed to match printable ASCII and valid UTF-8 multi-byte sequences."""

import re
from typing import Iterator, Tuple


# Matches one or more valid, printable UTF-8 encoded characters.
# The lead/continuation byte ranges deliberately exclude ill-formed input:
# overlong encodings (the \xE0 and \xF0 alternatives floor their second
# byte), UTF-16 surrogate code points (the \xED alternative caps its second
# byte at \x9F), and code points above U+10FFFF (the \xF4 alternative caps
# its second byte at \x8F).
UTF8_PRINTABLE_PATTERN = re.compile(
    b'(?:'
    b'[\x20-\x7E\t\r\n]' # 1-byte: ASCII printable characters and standard whitespace
    b'|[\xC2-\xDF][\x80-\xBF]' # 2-byte: Valid UTF-8 sequence
    b'|\xE0[\xA0-\xBF][\x80-\xBF]' # 3-byte: Valid UTF-8 sequence (no overlongs)
    b'|[\xE1-\xEC][\x80-\xBF]{2}' # 3-byte: Valid UTF-8 sequence
    b'|\xED[\x80-\x9F][\x80-\xBF]' # 3-byte: Valid UTF-8 sequence (no surrogates)
    b'|[\xEE-\xEF][\x80-\xBF]{2}' # 3-byte: Valid UTF-8 sequence
    b'|\xF0[\x90-\xBF][\x80-\xBF]{2}' # 4-byte: Valid UTF-8 sequence (no overlongs)
    b'|[\xF1-\xF3][\x80-\xBF]{3}' # 4-byte: Valid UTF-8 sequence
    b'|\xF4[\x80-\x8F][\x80-\xBF]{2}' # 4-byte: Valid UTF-8 sequence (<= U+10FFFF)
    b')+'
)

def extract_utf8_strings(buf: bytes, min_length: int = 4) -> Iterator[Tuple[int, str]]:
    """
    Scan a byte buffer and yield strictly valid, printable UTF-8 strings.

    UTF-16/wide strings never match the byte pattern, so they are ignored
    entirely — this avoids the garbage wide-string extraction seen in
    Rust binaries.

    Args:
        buf: the raw binary data to scan.
        min_length: minimum character count for a string to be reported.

    Yields:
        (offset, text) pairs: starting byte offset and the decoded string.
    """
    # One or more valid printable UTF-8 encoded characters; lead-byte ranges
    # exclude overlongs, surrogates, and code points above U+10FFFF.
    # (re.compile caches, so repeated calls do not re-parse the pattern.)
    pattern = re.compile(
        b'(?:'
        b'[\x20-\x7E\t\r\n]'              # 1-byte: ASCII printable + whitespace
        b'|[\xC2-\xDF][\x80-\xBF]'        # 2-byte sequences
        b'|\xE0[\xA0-\xBF][\x80-\xBF]'    # 3-byte, overlong-safe E0 lead
        b'|[\xE1-\xEC][\x80-\xBF]{2}'     # 3-byte sequences
        b'|\xED[\x80-\x9F][\x80-\xBF]'    # 3-byte, excludes surrogates
        b'|[\xEE-\xEF][\x80-\xBF]{2}'     # 3-byte sequences
        b'|\xF0[\x90-\xBF][\x80-\xBF]{2}' # 4-byte, overlong-safe F0 lead
        b'|[\xF1-\xF3][\x80-\xBF]{3}'     # 4-byte sequences
        b'|\xF4[\x80-\x8F][\x80-\xBF]{2}' # 4-byte, capped at U+10FFFF
        b')+'
    )

    for hit in pattern.finditer(buf):
        raw = hit.group(0)
        try:
            text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # The pattern should guarantee decodability; skip defensively if
            # an edge case ever slips past the regex.
            continue
        # only report strings meeting the minimum length threshold
        if len(text) >= min_length:
            yield (hit.start(), text)

if __name__ == "__main__":
# Quick sanity check / localized test
test_buffer = (
b"Garbage\x00\x00\x00" # Should be ignored or split
b"Valid_UTF8_String\x00" # Should extract "Valid_UTF8_String"
b"W\x00i\x00d\x00e\x00" # UTF-16 wide string: Should be completely ignored
b"\xE2\x9C\x93_Checkmark\x00" # Should extract "✓_Checkmark"
)

print("Extracting strings from test buffer...")
for offset, extracted_str in extract_utf8_strings(test_buffer, min_length=4):
print(f"Offset: {hex(offset)} | String: '{extracted_str}'")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The file is missing a final newline character. It's a standard convention (part of the POSIX standard for text files) to end files with a newline. Some tools may not process the last line correctly if it's not terminated by a newline. Please add a newline at the end of the file for compatibility and consistency.

        print(f"Offset: {hex(offset)} | String: '{extracted_str}'")