scrubber.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under both the MIT license found in the
# LICENSE-MIT file in the root directory of this source tree and the Apache
# License, Version 2.0 found in the LICENSE-APACHE file in the root directory
# of this source tree.

# pyre-strict

import json
import os
import shutil
import subprocess

from typing import BinaryIO, Callable, List, Optional, Set, Tuple

from apple.tools.re_compatibility_utils.writable import make_path_user_writable

from .macho import Symbol
from .macho_parser import load_commands, load_debug_symbols, load_header
from .spec import Spec
from .utils import MachOException

FAKE_PATH = b"fake/path"
# buck-out/isolation_dir/gen/project_cell/{hash}/....
NUM_OF_COMPONENTS_IN_BUCK2_OUTPUT_PATH_BEFORE_PROJECT_PATH = 5


def _always_scrub(_: str) -> bool:
    return True


# Visible for testing
def load_focused_targets_output_paths(json_file_path: str) -> Set[str]:
    if json_file_path is None or not os.path.exists(json_file_path):
        return set()

    with open(json_file_path, "r") as f:
        content = f.read()
        if not content:
            return set()
        data = json.loads(content)

    output_paths = set()
    for target in data["targets"]:
        _, package_and_name = target.split("//")
        package, name = package_and_name.split(":")
        # This assumes the output path created by buck2, which if
        # modified, would break this logic.
        output_directory = f"{package}/__{name}__"
        output_paths.add(output_directory)
    return output_paths


# This function converts buck-out/isolation_dir/gen/project_cell/{hash}/X/Y/__name__/libFoo.a
# into X/Y/__name__ to match the focused target output path created by
# load_focused_targets_output_paths.
# Visible for testing
def _get_target_output_path_from_debug_file_path(
    debug_target_path: str,
) -> str:
    # This function assumes the debug file path created by buck2 is in one of the following formats:
    # buck-out/isolation_dir/gen/project_cell/{hash}/.../__name__/libFoo.a
    # buck-out/isolation_dir/gen/project_cell/{hash}/.../__name__/__objects__/bar.o
    # buck-out/isolation_dir/gen/project_cell/{hash}/.../__name__/swift_object_file.o
    parts = debug_target_path.split("/")
    # We traverse in reverse order so that we find the target directory sooner.
    # _should_scrub can get called many times, so it's important that we make it
    # as efficient as possible.
    i = 1
    while i <= len(parts):
        # We are looking for the path component that matches __name__ in the examples
        # above; if we don't explicitly skip __objects__, we'll get the wrong path.
        if (
            parts[-i].startswith("__")
            and parts[-i].endswith("__")
            and parts[-i] != "__objects__"
        ):
            break
        i += 1

    if i > len(parts):
        raise Exception(
            f"Unrecognized format for debug file path: {debug_target_path}"
        )

    return "/".join(
        parts[NUM_OF_COMPONENTS_IN_BUCK2_OUTPUT_PATH_BEFORE_PROJECT_PATH : -i + 1]
    )


# Visible for testing
def should_scrub_with_focused_targets_output_paths(
    focused_targets_output_paths: Set[str], debug_file_path: str
) -> bool:
    # All paths are scrubbed when no focused target is specified.
    if len(focused_targets_output_paths) == 0:
        return True

    # debug_file_path usually has the format x/y/z/libFoo.a(bar.m.o)
    if "(" in debug_file_path:
        debug_target_path = debug_file_path.split("(")[0]
    else:
        debug_target_path = debug_file_path

    if debug_file_path.startswith("buck-out/"):
        target_output_path = _get_target_output_path_from_debug_file_path(
            debug_target_path
        )
        return target_output_path not in focused_targets_output_paths
    else:
        # Occasionally the archive file can come directly from source.
        (package, name) = os.path.split(debug_file_path)
        while package != "":
            if f"{package}/__{name}__" in focused_targets_output_paths:
                return False
            (package, name) = os.path.split(package)
        return True


def _should_scrub_with_targets_file(
    json_file_path: str, additional_labels: Set[str]
) -> Callable[[str], bool]:
    focused_targets_output_paths = load_focused_targets_output_paths(json_file_path)
    return lambda debug_file_path: should_scrub_with_focused_targets_output_paths(
        focused_targets_output_paths.union(additional_labels), debug_file_path
    )


def _should_scrub_with_spec_file(
    json_file_path: str, additional_labels: Set[str]
) -> Callable[[str], bool]:
    spec = Spec(json_file_path)
    return lambda debug_file_path: should_scrub_with_focused_targets_output_paths(
        additional_labels, debug_file_path
    ) and spec.scrub_debug_file_path(debug_file_path)


def _scrub(
    f: BinaryIO,
    strtab_offset: int,
    symbols: List[Symbol],
    scrub_handler: Callable[[str], bool],
) -> List[Tuple[str, str]]:
    """
    Return a list of tuples. Each tuple contains a pair of the original path
    and the rewritten path.
    """
    results = []
    for symbol in symbols:
        f.seek(strtab_offset)
        f.seek(symbol.strtab_index, 1)
        # Read a byte at a time until we reach the end of the path, denoted by 0x00.
        start = end = f.tell()
        path = b""
        b = f.read(1)
        while b != b"\x00":
            path += b
            end += 1
            b = f.read(1)

        str_len = end - start
        path_str = path.decode()
        if scrub_handler(path_str):
            f.seek(start)
            # We don't want to modify the length of the path, so pad the replacement
            # path with spaces.
            buffer = FAKE_PATH + b" " * (str_len - len(FAKE_PATH))
            f.write(buffer)
            results.append((path_str, buffer.decode()))
        else:
            results.append((path_str, path_str))
    return results


def scrub(
    input_file: str,
    output_file: str,
    persisted_targets_file: str,
    targets_file: Optional[str] = None,
    spec_file: Optional[str] = None,
    adhoc_codesign_tool: Optional[str] = None,
) -> List[Tuple[str, str]]:
    additional_labels = load_focused_targets_output_paths(persisted_targets_file)
    if targets_file and spec_file:
        raise Exception(
            "Only one of a targets file or spec file is supported, not both!"
        )
    elif targets_file:
        scrub_handler = _should_scrub_with_targets_file(targets_file, additional_labels)
    elif spec_file:
        scrub_handler = _should_scrub_with_spec_file(spec_file, additional_labels)
    else:
        scrub_handler = _always_scrub

    shutil.copy2(input_file, output_file)
    # Make it RE-compatible
    make_path_user_writable(output_file)

    results = []
    with open(output_file, "r+b") as f:
        header, offset = load_header(f, 0)
        if not header.is_valid:
            raise MachOException("Invalid macho format!")
        lc_linkedit, lc_symtab = load_commands(f, offset, header.n_cmds)
        if lc_linkedit is None:
            return []
        if lc_symtab is None:
            raise MachOException("LC_SYMTAB command not found")
        if lc_symtab.strtab_size == 0 or lc_symtab.n_symbols == 0:
            return []
        if lc_linkedit.file_size == 0:
            raise MachOException("LC_SEGMENT_64 command for string table not found")

        f.seek(lc_symtab.strtab_offset)
        """
        ld64 deliberately burns the first byte with the space character, so that
        zero is never a valid string index, and writes 0x00 at offset 1, so that
        it's always the empty string. The code for this in ld64 is in
        LinkEditClassic.hpp (StringPoolAtom::StringPoolAtom).
        """
        if f.read(1) != b"\x20":
            raise MachOException("First character in the string table is not a space")
        if f.read(1) != b"\x00":
            raise MachOException("Second character in the string table is not a NUL")
        symbols = load_debug_symbols(f, lc_symtab.symtab_offset, lc_symtab.n_symbols)
        results = _scrub(f, lc_symtab.strtab_offset, symbols, scrub_handler)

    if adhoc_codesign_tool:
        subprocess.run(
            [adhoc_codesign_tool, "--binary", output_file],
            check=True,
        )

    return results
