From 439ae40a3821e540eeb62925883eb32cea59d3eb Mon Sep 17 00:00:00 2001 From: Kevin Phoenix Date: Thu, 26 Oct 2023 14:21:02 -0700 Subject: [PATCH] Add bindings to agent environment --- .gitignore | 1 + README.md | 12 + crates/bh_agent_client/src/bindings.rs | 2 +- crates/bh_agent_common/src/service.rs | 2 +- crates/bh_agent_server/Cargo.toml | 3 + crates/bh_agent_server/src/main.rs | 2 + crates/bh_agent_server/src/server.rs | 2 +- crates/bh_agent_server/src/state.rs | 75 ++++-- crates/bh_agent_server/src/util/read_chars.rs | 23 +- pyproject.toml | 10 + python/bh_agent_client.pyi | 31 +++ python/binharness/agentenvironment.py | 245 ++++++++++++++++++ 12 files changed, 381 insertions(+), 27 deletions(-) create mode 100644 python/bh_agent_client.pyi create mode 100644 python/binharness/agentenvironment.py diff --git a/.gitignore b/.gitignore index fc9b3aa..1c549a1 100644 --- a/.gitignore +++ b/.gitignore @@ -64,6 +64,7 @@ docs/_build/ # PyCharm .idea/ +*.iml # VSCode .vscode/ diff --git a/README.md b/README.md index e8e376c..2196760 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,15 @@ BinHarness === BinHarness is a framework to facilitate analyzing binary programs in various environments. + +## Project layout +``` +binharness +| crates: Main directory for rust code + | bh_agent_client: Client code for communicating with the binharness agent + | bh_agent_common: Shared code between the agent client and server + | bh_agent_server: The agent server program +| python: Main directory for python code + | binharness: Main python module + | tests: Test code +``` diff --git a/crates/bh_agent_client/src/bindings.rs b/crates/bh_agent_client/src/bindings.rs index 07f418e..825db6f 100644 --- a/crates/bh_agent_client/src/bindings.rs +++ b/crates/bh_agent_client/src/bindings.rs @@ -177,7 +177,7 @@ impl BhAgentClient { ) } - fn file_read(&self, env_id: EnvironmentId, fd: FileId, num_bytes: u32) -> PyResult> { + fn file_read(&self, env_id: EnvironmentId, fd: FileId, num_bytes: Option) -> PyResult> { run_in_runtime( self, self.client diff --git a/crates/bh_agent_common/src/service.rs b/crates/bh_agent_common/src/service.rs index 78c50aa..9af932e 100644 --- a/crates/bh_agent_common/src/service.rs +++ b/crates/bh_agent_common/src/service.rs @@ -45,7 +45,7 @@ pub trait BhAgentService { async fn file_read( env_id: EnvironmentId, fd: FileId, - num_bytes: u32, + num_bytes: Option, ) -> Result, AgentError>; async fn file_read_lines( diff --git a/crates/bh_agent_server/Cargo.toml b/crates/bh_agent_server/Cargo.toml index ac68d93..c808a18 100644 --- a/crates/bh_agent_server/Cargo.toml +++ b/crates/bh_agent_server/Cargo.toml @@ -13,3 +13,6 @@ tarpc = { version = "0.33.0", features = ["full"] } tokio = { version = "1.32.0", features = ["full"] } futures-util = "0.3.28" futures = "0.3.28" +log = "0.4.20" +env_logger = "0.10.0" +bimap = "0.6.3" diff --git a/crates/bh_agent_server/src/main.rs b/crates/bh_agent_server/src/main.rs index be840de..c0a6910 100644 --- a/crates/bh_agent_server/src/main.rs +++ b/crates/bh_agent_server/src/main.rs @@ -25,6 +25,8 @@ fn parse_args() -> Result<(IpAddr, u16)> { #[tokio::main] async fn main() -> anyhow::Result<()> { + env_logger::init(); + let server_addr = parse_args().or_else(|e| -> Result<(IpAddr, u16)> { eprintln!("{}", e); std::process::exit(1); diff --git a/crates/bh_agent_server/src/server.rs b/crates/bh_agent_server/src/server.rs index 05265ed..1973489 100644 --- a/crates/bh_agent_server/src/server.rs +++ b/crates/bh_agent_server/src/server.rs @@ -132,7 +132,7 @@ impl BhAgentService for BhAgentServer { _: Context, env_id: EnvironmentId, fd: FileId, - num_bytes: u32, + num_bytes: Option, ) -> Self::FileReadFut { check_env_id!(env_id); diff --git a/crates/bh_agent_server/src/state.rs b/crates/bh_agent_server/src/state.rs index 5f60ae9..ce4fb29 100644 --- a/crates/bh_agent_server/src/state.rs +++ b/crates/bh_agent_server/src/state.rs @@ -1,8 +1,10 @@ +use bimap::BiMap; use std::collections::HashMap; use std::ffi::OsStr; +use std::fmt::Display; use std::fs::{File, OpenOptions}; use std::sync::{Arc, RwLock}; -use futures_util::TryFutureExt; +use log::{debug, trace}; use subprocess::{Popen, PopenConfig}; @@ -20,9 +22,9 @@ pub struct BhAgentState { file_modes: RwLock>, file_types: RwLock>, processes: RwLock>>>, - proc_stdin_ids: RwLock>, - proc_stdout_ids: RwLock>, - proc_stderr_ids: RwLock>, + proc_stdin_ids: RwLock>, + proc_stdout_ids: RwLock>, + proc_stderr_ids: RwLock>, next_file_id: RwLock, next_process_id: RwLock, @@ -35,9 +37,9 @@ impl BhAgentState { file_modes: RwLock::new(HashMap::new()), file_types: RwLock::new(HashMap::new()), processes: RwLock::new(HashMap::new()), - proc_stdin_ids: RwLock::new(HashMap::new()), - proc_stdout_ids: RwLock::new(HashMap::new()), - proc_stderr_ids: RwLock::new(HashMap::new()), + proc_stdin_ids: RwLock::new(BiMap::new()), + proc_stdout_ids: RwLock::new(BiMap::new()), + proc_stderr_ids: RwLock::new(BiMap::new()), next_file_id: RwLock::new(0), next_process_id: RwLock::new(0), @@ -63,6 +65,7 @@ impl BhAgentState { fd: &FileId, modes: &Vec, ) -> Result { + trace!("Checking file {} for modes {:?}", fd, modes); Ok(modes.contains( self.file_modes .read()? @@ -72,6 +75,7 @@ impl BhAgentState { } pub fn file_type(&self, fd: &FileId) -> Result { + trace!("Getting file type for {}", fd); Ok(self .file_types .read()? @@ -154,16 +158,31 @@ impl BhAgentState { // Stick the process channels into the file map if proc.stdin.is_some() { + trace!("Saving stdin for process {}", proc_id); let file_id = self.take_file_id()?; - self.proc_stdin_ids.write()?.insert(file_id, proc_id); + self.proc_stdin_ids.write()?.insert(proc_id, file_id); + self.file_modes.write()?.insert(file_id, FileOpenMode::Write); + self.file_types.write()?.insert(file_id, FileOpenType::Binary); + } else { + trace!("Process {} has no stdin", proc_id); } if proc.stdout.is_some() { + trace!("Saving stdout for process {}", proc_id); let file_id = self.take_file_id()?; - self.proc_stdout_ids.write()?.insert(file_id, proc_id); + self.proc_stdout_ids.write()?.insert(proc_id, file_id); + self.file_modes.write()?.insert(file_id, FileOpenMode::Read); + self.file_types.write()?.insert(file_id, FileOpenType::Binary); + } else { + trace!("Process {} has no stdout", proc_id); } if proc.stderr.is_some() { + trace!("Saving stderr for process {}", proc_id); let file_id = self.take_file_id()?; - self.proc_stdout_ids.write()?.insert(file_id, proc_id); + self.proc_stdout_ids.write()?.insert(proc_id, file_id); + self.file_modes.write()?.insert(file_id, FileOpenMode::Read); + self.file_types.write()?.insert(file_id, FileOpenType::Binary); + } else { + trace!("Process {} has no stderr", proc_id); } // Move the proc to the process map @@ -179,18 +198,30 @@ impl BhAgentState { proc_id: &ProcessId, channel: ProcessChannel, ) -> Result { - match channel { + let channel_ids = match channel { ProcessChannel::Stdin => &self.proc_stdin_ids, ProcessChannel::Stdout => &self.proc_stdout_ids, ProcessChannel::Stderr => &self.proc_stderr_ids, - } + }; + + channel_ids .read()? - .get(&proc_id) + .get_by_left(&proc_id) .map(|i| i.clone()) - .ok_or(InvalidProcessId) + .ok_or((|| { + debug!("Failed to get process channel"); + debug!("Process ID: {}", proc_id); + debug!("Channel: {:?}", channel); + let proc_is_valid = self.processes.read().unwrap().contains_key(&proc_id); + debug!("Process is valid: {}", proc_is_valid); + debug!("Process with valid channels: {:?}", channel_ids.read().unwrap().left_values()); + + InvalidProcessId + })()) } pub fn close_file(&self, fd: &FileId) -> Result<(), AgentError> { + trace!("Closing file {}", fd); Ok(drop( self.files .write()? @@ -208,29 +239,39 @@ impl BhAgentState { fd: &FileId, op: impl Fn(&mut File) -> R, ) -> Result { + trace!("Doing mut operation on file {}", fd); + // Get file logic if let Some(file_lock) = self.files.read()?.get(fd) { return Ok(op(&mut *file_lock.write()?)); + } else { + trace!("File id map: {:?}", self.files.read()?); } // If these unwraps fail, the state is bad - if let Some(pid) = self.proc_stdin_ids.read()?.get(&fd) { + if let Some(pid) = self.proc_stdin_ids.read()?.get_by_right(&fd) { let procs_binding = self.processes.read()?; let mut proc_binding = procs_binding.get(pid).unwrap().write()?; let file = proc_binding.stdin.as_mut().unwrap(); return Ok(op(file)); + } else { + trace!("Process stdin id map: {:?}", self.proc_stdin_ids.read()?); } - if let Some(pid) = self.proc_stdout_ids.read()?.get(&fd) { + if let Some(pid) = self.proc_stdout_ids.read()?.get_by_right(&fd) { let procs_binding = self.processes.read()?; let mut proc_binding = procs_binding.get(pid).unwrap().write()?; let file = proc_binding.stdout.as_mut().unwrap(); return Ok(op(file)); + } else { + trace!("Process stdout id map: {:?}", self.proc_stdout_ids.read()?); } - if let Some(pid) = self.proc_stderr_ids.read()?.get(&fd) { + if let Some(pid) = self.proc_stderr_ids.read()?.get_by_right(&fd) { let procs_binding = self.processes.read()?; let mut proc_binding = procs_binding.get(pid).unwrap().write()?; let file = proc_binding.stderr.as_mut().unwrap(); return Ok(op(file)); + } else { + trace!("Process stderr id map: {:?}", self.proc_stderr_ids.read()?); } Err(InvalidFileDescriptor) diff --git a/crates/bh_agent_server/src/util/read_chars.rs b/crates/bh_agent_server/src/util/read_chars.rs index 2a077cf..34434d9 100644 --- a/crates/bh_agent_server/src/util/read_chars.rs +++ b/crates/bh_agent_server/src/util/read_chars.rs @@ -2,17 +2,26 @@ use std::fs::File; use std::io::Read; use anyhow::Result; +use log::trace; use bh_agent_common::FileOpenType; -pub fn read_generic(mut file: &File, n: u32, file_type: FileOpenType) -> Result> { - match file_type { - FileOpenType::Binary => { - let mut buffer = vec![0u8; n as usize]; - file.read(&mut buffer)?; - Ok(buffer) +pub fn read_generic(mut file: &File, n: Option, file_type: FileOpenType) -> Result> { + trace!("Entering read_generic"); + if let Some(num_bytes) = n { + match file_type { + FileOpenType::Binary => { + let mut buffer = vec![0u8; num_bytes as usize]; + file.read(&mut buffer)?; + Ok(buffer) + } + FileOpenType::Text => Ok(read_chars(&mut file, num_bytes as usize)?), } - FileOpenType::Text => Ok(read_chars(&mut file, n as usize)?), + } else { + // if n is None, we just read the whole file, text parsing happens on the client + let mut buffer = Vec::new(); + file.read_to_end(&mut buffer)?; + Ok(buffer) } } diff --git a/pyproject.toml b/pyproject.toml index 9c997eb..dff3362 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,10 +55,15 @@ ignore = [ "COM", "D203", "D213", + "FBT003", + "FIX", "S108", "S603", + "TD002", + "TD003", ] ignore-init-module-imports = true +line-length = 88 [tool.ruff.isort] required-imports = ["from __future__ import annotations"] @@ -68,6 +73,11 @@ required-imports = ["from __future__ import annotations"] "D", "S101", ] +"**/*.pyi" = [ + "ANN101", + "FBT", + "PLR0913", +] [tool.mypy] python_version = "3.8" diff --git a/python/bh_agent_client.pyi b/python/bh_agent_client.pyi new file mode 100644 index 0000000..c337349 --- /dev/null +++ b/python/bh_agent_client.pyi @@ -0,0 +1,31 @@ +class BhAgentClient: + @staticmethod + def initialize_client(ip_addr: str, port: int) -> BhAgentClient: ... + def get_environments(self) -> list[int]: ... + def get_tempdir(self, env_id: int) -> str: ... + def run_process( + self, + env_id: int, + argv: list[str], + stdin: bool, + stdout: bool, + stderr: bool, + executable: str | None, + env: dict[str, str] | None, + cwd: str | None, + setuid: int | None, + setgid: int | None, + setpgid: int | None, + ) -> int: ... + def get_process_channel(self, env_id: int, proc_id: int, channel: int) -> int: ... + def file_open(self, env_id: int, path: str, mode_and_type: str) -> int: ... + def file_close(self, env_id: int, fd: int) -> None: ... + def file_is_closed(self, env_id: int, fd: int) -> bool: ... + def file_is_readable(self, env_id: int, fd: int) -> bool: ... + def file_read(self, env_id: int, fd: int, size: int | None) -> bytes: ... + def file_read_lines(self, env_id: int, fd: int, hint: int) -> list[bytes]: ... + def file_is_seekable(self, env_id: int, fd: int) -> bool: ... + def file_seek(self, env_id: int, fd: int, offset: int, whence: int) -> int: ... + def file_tell(self, env_id: int, fd: int) -> int: ... + def file_is_writable(self, env_id: int, fd: int) -> bool: ... + def file_write(self, env_id: int, fd: int, data: bytes) -> int: ... diff --git a/python/binharness/agentenvironment.py b/python/binharness/agentenvironment.py new file mode 100644 index 0000000..f6268d5 --- /dev/null +++ b/python/binharness/agentenvironment.py @@ -0,0 +1,245 @@ +"""binharness.agentenvironment - AgentEnvironment class.""" +from __future__ import annotations + +from functools import cached_property +from pathlib import Path +from typing import Sequence + +from bh_agent_client import BhAgentClient + +from binharness.types.environment import Environment +from binharness.types.io import IO +from binharness.types.process import Process +from binharness.util import normalize_args + + +class AgentIO(IO[bytes]): + """AgentIO implements the IO interface for agents.""" + + # TODO: This doesn't do bytes/str right just yet. We need to be able to + # query the agent for the encoding of the file. + _client: BhAgentClient + _environment_id: int + _fd: int + + def __init__( + self: AgentIO, client: BhAgentClient, environment_id: int, fd: int + ) -> None: + """Create an AgentIO.""" + self._client = client + self._environment_id = environment_id + self._fd = fd + + def close(self: AgentIO) -> None: + """Close the file.""" + return self._client.file_close(self._environment_id, self._fd) + + @property + def closed(self: AgentIO) -> bool: + """Whether the file is closed.""" + return self._client.file_is_closed(self._environment_id, self._fd) + + def flush(self: AgentIO) -> None: + """Flush the file.""" + # TODO: Need to implement this in the agent protocol + + def read(self: AgentIO, n: int = -1) -> bytes: + """Read n bytes from the file.""" + return self._client.file_read( + self._environment_id, self._fd, None if n == -1 else n + ) + + def readable(self: AgentIO) -> bool: + """Whether the file is readable.""" + return self._client.file_is_readable(self._environment_id, self._fd) + + def readline(self: AgentIO, limit: int = -1) -> bytes: # noqa: ARG002 + """Read a line from the file.""" + lines = self._client.file_read_lines(self._environment_id, self._fd, 1) + if lines: + return lines[0] + return b"" # TODO: Verify this matches python stdlib behavior + + def readlines(self: AgentIO, hint: int = -1) -> list[bytes]: + """Read lines from the file.""" + return self._client.file_read_lines(self._environment_id, self._fd, hint) + + def seek(self: AgentIO, offset: int, whence: int = 0) -> int | None: + """Seek to a position in the file.""" + return self._client.file_seek(self._environment_id, self._fd, offset, whence) + + def seekable(self: AgentIO) -> bool: + """Whether the file is seekable.""" + return self._client.file_is_seekable(self._environment_id, self._fd) + + def tell(self: AgentIO) -> int: + """Get the current position in the file.""" + return self._client.file_tell(self._environment_id, self._fd) + + def writable(self: AgentIO) -> bool: + """Whether the file is writable.""" + return self._client.file_is_writable(self._environment_id, self._fd) + + def write(self: AgentIO, s: bytes) -> int | None: + """Write to the file.""" + return self._client.file_write(self._environment_id, self._fd, s) + + def writelines(self: AgentIO, lines: list[bytes]) -> None: + """Write lines to the file.""" + self._client.file_write(self._environment_id, self._fd, b"\n".join(lines)) + + +class AgentProcess(Process): + """A process running in an agent environment.""" + + _client: BhAgentClient + _env_id: int + _pid: int + + def __init__( # noqa: PLR0913 + self: AgentProcess, + client: BhAgentClient, + env_id: int, + pid: int, + environment: Environment, + args: Sequence[str], + env: dict[str, str] | None, + cwd: Path | None, + ) -> None: + """Create an AgentProcess.""" + super().__init__(environment, args, env, cwd) + self._client = client + self._env_id = env_id + self._pid = pid + + @cached_property + def stdin(self: AgentProcess) -> AgentIO: + """Get the standard input stream of the process.""" + fd = self._client.get_process_channel(self._env_id, self._pid, 0) + return AgentIO(self._client, self._env_id, fd) + + @cached_property + def stdout(self: AgentProcess) -> AgentIO: + """Get the standard output stream of the process.""" + fd = self._client.get_process_channel(self._env_id, self._pid, 1) + return AgentIO(self._client, self._env_id, fd) + + @cached_property + def stderr(self: AgentProcess) -> AgentIO: + """Get the standard error stream of the process.""" + fd = self._client.get_process_channel(self._env_id, self._pid, 2) + return AgentIO(self._client, self._env_id, fd) + + @property + def returncode(self: AgentProcess) -> int | None: + """Get the process' exit code.""" + # TODO: Need to implement this in the agent protocol + return None + + def poll(self: AgentProcess) -> int | None: + """Return the process' exit code if it has terminated, or None.""" + # TODO: Need to implement this in the agent protocol + return None + + def wait(self: AgentProcess, timeout: float | None = None) -> int: # noqa: ARG002 + """Wait for the process to terminate and return its exit code.""" + # TODO: Need to implement this in the agent protocol + return 0 + + +class AgentEnvironment(Environment): + """AgentEnvironment implements the Environment interface for agents.""" + + _client: BhAgentClient + _id: int + + def __init__(self: AgentEnvironment, client: BhAgentClient, id_: int) -> None: + """Create an AgentEnvironment.""" + self._client = client + self._id = id_ + + def run_command( + self: AgentEnvironment, + *args: Path | str | Sequence[Path | str], + env: dict[str, str] | None = None, + cwd: Path | None = None, + ) -> AgentProcess: + """Run a command in the environment.""" + normalized_args = list(normalize_args(*args)) + + pid = self._client.run_process( + env_id=self._id, + argv=normalized_args, + stdin=True, + stdout=True, + stderr=True, + executable=str(args[0]), + env=env, + cwd=str(cwd) if cwd else None, + setuid=None, + setgid=None, + setpgid=False, + ) + return AgentProcess( + self._client, + self._id, + pid, + self, + normalized_args, + env, + cwd, + ) + + def inject_files(self: AgentEnvironment, files: list[tuple[Path, Path]]) -> None: + """Inject files into the environment.""" + for src, dst in files: + fd = self._client.file_open(self._id, str(dst), "wb") + with src.open("rb") as f: + while chunk := f.read(4096): + self._client.file_write(self._id, fd, chunk) + self._client.file_close(self._id, fd) + + def retrieve_files(self: AgentEnvironment, files: list[tuple[Path, Path]]) -> None: + """Retrieve files from the environment.""" + for src, dst in files: + fd = self._client.file_open(self._id, str(src), "rb") + with dst.open("wb") as f: + while chunk := self._client.file_read(self._id, fd, 4096): + f.write(chunk) + + def get_tempdir(self: AgentEnvironment) -> Path: + """Get a Path for a temporary directory.""" + return Path(self._client.get_tempdir(self._id)) + + def open_file(self: AgentEnvironment, path: Path, mode: str) -> IO[bytes]: # type: ignore [override] + """Open a file in the environment. Follows the same semantics as `open`.""" + # TODO: Need to better handle mode/typing here + fd = self._client.file_open(self._id, str(path), mode) + return AgentIO(self._client, self._id, fd) + + +class AgentConnection: + """AgentConnection represents a connection to an agent. + + It serves as the main interface for interacting with the agent client. + """ + + _client: BhAgentClient + _env_cache: dict[int, AgentEnvironment] + + def __init__(self: AgentConnection, host: str, port: int) -> None: + """Create an AgentConnection.""" + self._client = BhAgentClient.initialize_client(host, port) + self._env_cache = {} + + def get_environment_ids(self: AgentConnection) -> list[int]: + """Get a list of environment IDs that are currently active on the agent.""" + return self._client.get_environments() + + def get_environment(self: AgentConnection, id_: int) -> AgentEnvironment: + """Get an AgentEnvironment for the given environment ID.""" + if id_ not in self._env_cache: + new_env = AgentEnvironment(self._client, id_) + self._env_cache[id_] = new_env + return new_env + return self._env_cache[id_]