-
Notifications
You must be signed in to change notification settings - Fork 524
Fix b2s wide string garbage extraction in Rust binaries issue #867 #1223
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,11 +17,11 @@ | |
| import pathlib | ||
| import argparse | ||
| import itertools | ||
| from typing import List, Tuple, Iterable, Optional | ||
| from typing import List, Iterable | ||
|
|
||
| import pefile | ||
| import binary2strings as b2s | ||
|
|
||
| from .utf8_strings import extract_utf8_strings | ||
| from floss.results import StaticString, StringEncoding | ||
| from floss.language.utils import ( | ||
| find_lea_xrefs, | ||
|
|
@@ -36,62 +36,6 @@ | |
| MIN_STR_LEN = 4 | ||
|
|
||
|
|
||
| def fix_b2s_wide_strings( | ||
| strings: List[Tuple[str, str, Tuple[int, int], bool]], min_length: int, buffer: bytes | ||
| ) -> List[Tuple[str, str, Tuple[int, int], bool]]: | ||
| # TODO(mr-tz): b2s may parse wide strings where there really should be utf-8 strings | ||
| # handle special cases here until fixed | ||
| # https://github.com/mandiant/flare-floss/issues/867 | ||
| fixed_strings: List[Tuple[str, str, Tuple[int, int], bool]] = list() | ||
| last_fixup: Optional[Tuple[str, str, Tuple[int, int], bool]] = None | ||
| for string in strings: | ||
| s = string[0] | ||
| string_type = string[1] | ||
| start = string[2][0] | ||
|
|
||
| if string_type == "WIDE_STRING": | ||
| sd = s.encode("utf-16le", "ignore") | ||
| # utf-8 strings will not start with \x00 | ||
| if sd[0] == 0: | ||
| new_string = b2s.extract_string(buffer[start + 1 :]) | ||
| last_fixup = ( | ||
| new_string[0], | ||
| new_string[1], | ||
| (new_string[2][0] + start + 1, new_string[2][1] + start + 1), | ||
| new_string[3], | ||
| ) | ||
| if len(last_fixup[0]) < min_length: | ||
| last_fixup = None | ||
| else: | ||
| if last_fixup and s in last_fixup[0]: | ||
| fixed_strings.append(last_fixup) | ||
| else: | ||
| fixed_strings.append(string) | ||
| last_fixup = None | ||
| return fixed_strings | ||
|
|
||
|
|
||
| def filter_and_transform_utf8_strings( | ||
| strings: List[Tuple[str, str, Tuple[int, int], bool]], | ||
| start_rdata: int, | ||
| ) -> List[StaticString]: | ||
| transformed_strings = [] | ||
|
|
||
| for string in strings: | ||
| s = string[0] | ||
| string_type = string[1] | ||
| start = string[2][0] + start_rdata | ||
|
|
||
| if string_type != "UTF8": | ||
| continue | ||
|
|
||
| # our static algorithm does not extract new lines either | ||
| s = s.replace("\n", "") | ||
| transformed_strings.append(StaticString(string=s, offset=start, encoding=StringEncoding.UTF8)) | ||
|
|
||
| return transformed_strings | ||
|
|
||
|
|
||
| def split_strings(static_strings: List[StaticString], address: int, min_length: int) -> None: | ||
| """ | ||
| if address is in between start and end of a string in ref data then split the string | ||
|
|
@@ -163,12 +107,20 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt | |
| pointer_to_raw_data = rdata_section.PointerToRawData | ||
| buffer_rdata = rdata_section.get_data() | ||
|
|
||
| # extract utf-8 and wide strings, latter not needed here | ||
| strings = b2s.extract_all_strings(buffer_rdata, min_length) | ||
| fixed_strings = fix_b2s_wide_strings(strings, min_length, buffer_rdata) | ||
|
|
||
| # select only UTF-8 strings and adjust offset | ||
| static_strings = filter_and_transform_utf8_strings(fixed_strings, start_rdata) | ||
| # Extract strictly valid UTF-8 strings using our custom regex implementation | ||
| # This automatically prevents the b2s wide-string garbage extraction bug | ||
| static_strings: List[StaticString] = [] | ||
| for offset, string_val in extract_utf8_strings(buffer_rdata, min_length=min_length): | ||
| # our static algorithm does not extract new lines either | ||
| clean_str = string_val.replace("\n", "") | ||
|
|
||
| static_strings.append( | ||
| StaticString( | ||
| string=clean_str, | ||
| offset=offset + start_rdata, | ||
| encoding=StringEncoding.UTF8 | ||
| ) | ||
| ) | ||
|
|
||
| # TODO(mr-tz) - handle miss in rust-hello64.exe | ||
| # .rdata:00000001400C1270 0A aPanickedAfterP db 0Ah ; DATA XREF: .rdata:00000001400C12B8↓o | ||
|
|
@@ -228,4 +180,4 @@ def main(argv=None): | |
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| sys.exit(main()) | ||
| sys.exit(main()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The file is missing a final newline character. It's a standard convention (part of the POSIX standard for text files) to end files with a newline. Some tools may not process the last line correctly if it's not terminated by a newline. Please add a newline at the end of the file for compatibility and consistency. sys.exit(main()) |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| import re | ||
| from typing import Iterator, Tuple | ||
|
|
||
| '''This regular expression is strictly designed to match printable ASCII | ||
| and valid UTF-8 multi-byte sequences.''' | ||
|
Comment on lines +1 to +5
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The multi-line string on lines 4-5 should be a module-level docstring placed at the top of the file, as per PEP 257. This improves code readability and allows documentation-generation tools to correctly parse the file's purpose. """This regular expression is strictly designed to match printable ASCII and valid UTF-8 multi-byte sequences."""
import re
from typing import Iterator, Tuple |
||
|
|
||
| UTF8_PRINTABLE_PATTERN = re.compile( | ||
| b'(?:' | ||
| b'[\x20-\x7E\t\r\n]' # 1-byte: ASCII printable characters and standard whitespace | ||
| b'|[\xC2-\xDF][\x80-\xBF]' # 2-byte: Valid UTF-8 sequence | ||
| b'|\xE0[\xA0-\xBF][\x80-\xBF]' # 3-byte: Valid UTF-8 sequence | ||
| b'|[\xE1-\xEC][\x80-\xBF]{2}' # 3-byte: Valid UTF-8 sequence | ||
| b'|\xED[\x80-\x9F][\x80-\xBF]' # 3-byte: Valid UTF-8 sequence | ||
| b'|[\xEE-\xEF][\x80-\xBF]{2}' # 3-byte: Valid UTF-8 sequence | ||
| b'|\xF0[\x90-\xBF][\x80-\xBF]{2}' # 4-byte: Valid UTF-8 sequence | ||
| b'|[\xF1-\xF3][\x80-\xBF]{3}' # 4-byte: Valid UTF-8 sequence | ||
| b'|\xF4[\x80-\x8F][\x80-\xBF]{2}' # 4-byte: Valid UTF-8 sequence | ||
| b')+' | ||
| ) | ||
|
|
||
| def extract_utf8_strings(buf: bytes, min_length: int = 4) -> Iterator[Tuple[int, str]]: | ||
| """ | ||
| Scans a byte buffer and yields strictly valid, printable UTF-8 strings. | ||
| Ignores UTF-16/wide strings completely to prevent garbage extraction in Rust binaries. | ||
|
|
||
| Args: | ||
| buf (bytes): The raw binary data to scan. | ||
| min_length (int): The minimum character length for a valid string. Default is 4. | ||
|
|
||
| Yields: | ||
| Tuple[int, str]: A tuple containing the starting byte offset and the decoded string. | ||
| """ | ||
| for match in UTF8_PRINTABLE_PATTERN.finditer(buf): | ||
| string_bytes = match.group(0) | ||
|
|
||
| try: | ||
| # Decode the matched bytes into a standard Python string | ||
| decoded_string = string_bytes.decode('utf-8') | ||
|
|
||
| # Check if the length meets the minimum threshold | ||
| if len(decoded_string) >= min_length: | ||
| yield (match.start(), decoded_string) | ||
|
|
||
| except UnicodeDecodeError: | ||
| # Safely continue if an edge-case sequence bypasses the regex | ||
| # but fails the strict Python decoder. | ||
| continue | ||
|
|
||
| if __name__ == "__main__": | ||
| # Quick sanity check / localized test | ||
| test_buffer = ( | ||
| b"Garbage\x00\x00\x00" # Should be ignored or split | ||
| b"Valid_UTF8_String\x00" # Should extract "Valid_UTF8_String" | ||
| b"W\x00i\x00d\x00e\x00" # UTF-16 wide string: Should be completely ignored | ||
| b"\xE2\x9C\x93_Checkmark\x00" # Should extract "✓_Checkmark" | ||
| ) | ||
|
|
||
| print("Extracting strings from test buffer...") | ||
| for offset, extracted_str in extract_utf8_strings(test_buffer, min_length=4): | ||
| print(f"Offset: {hex(offset)} | String: '{extracted_str}'") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The file is missing a final newline character. It's a standard convention (part of the POSIX standard for text files) to end files with a newline. Some tools may not process the last line correctly if it's not terminated by a newline. Please add a newline at the end of the file for compatibility and consistency. print(f"Offset: {hex(offset)} | String: '{extracted_str}'") |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The loop for building
`static_strings` can be simplified into a more concise and Pythonic list comprehension. This improves readability and maintainability.